diff --git a/.gitignore b/.gitignore index 7248435..df19598 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ launchpad-system-flake.sublime-project launchpad-system-flake.sublime-workspace secrets/secrets.nix +flakes/tensorflow/_build/ +flakes/tensorflow/logs +flakes/tensorflow/venv + diff --git a/configuration.nix b/configuration.nix index cf0ce17..a4afc98 100644 --- a/configuration.nix +++ b/configuration.nix @@ -38,6 +38,7 @@ in ./modules/xdg.nix # ./pkgs/app/utilities/bacula.nix + # ./pkgs/app/servers/ff-sync.nix ./pkgs/virtualisation/docker.nix ./pkgs/virtualisation/vbox.nix @@ -78,6 +79,7 @@ in programs.firefox.enable = true; programs.zsh.enable = true; programs.nix-ld.enable = true; + programs.fuse.userAllowOther = true; # programs.nix-ld.libraries = with pkgs; [ # libusb @@ -256,14 +258,15 @@ in # kdePackages.wayland kdePackages.wayland-protocols kdePackages.xwaylandvideobridge kdePackages.kwayland kdePackages.wayqt kdePackages.qtwayland kdePackages.layer-shell-qt # marked broken kdePackages.kwayland-integration kdePackages.kdeplasma-addons kdePackages.qtstyleplugin-kvantum kdePackages.full kdePackages.qtwebengine kdePackages.qtpositioning kdePackages.qtlocation kdePackages.ark kdePackages.dolphin - kdePackages.dolphin-plugins # kdePackages.kwallet kdePackages.kwalletmanager + kdePackages.dolphin-plugins kdePackages.konsole # kdePackages.qtwayland kdePackages.qtsvg kdePackages.kio kdePackages.kio-fuse kdePackages.kio-extras kdePackages.kio-admin kdePackages.kdenetwork-filesharing kdePackages.kwallet kdePackages.kwalletmanager kdePackages.kwallet-pam - kdePackages.polkit-kde-agent-1 polkit-kde-agent kdePackages.kirigami # hyprpolkitagent polkit - # plasma5Packages.kwallet plasma5Packages.kwalletmanager plasma5Packages.kwallet-pam + kdePackages.polkit-kde-agent-1 kdePackages.kirigami + hyprpolkitagent polkit + gtk2 gnome-themes-extra @@ -274,8 +277,9 @@ in glibc # cmake cpio meson libsForQt5.kwayland libsForQt5.qt5.qtwayland 
libsForQt5.kwayland-integration # libsForQt5.xwaylandvideobridge - + # libsForQt5.polkit-kde-agent # libsForQt5.qtstyleplugin-kvantum libsForQt5.qt5.qtwayland libsForQt5.kio-extras # libsForQt5.qt5.qtgraphicaleffects + #kio-fuse catppuccin-sddm-corners sddm-astronaut @@ -287,7 +291,6 @@ in tailscale logiops - bacula ]; system.stateVersion = "24.11"; # Did you read the comment? } diff --git a/environment.nix b/environment.nix index 259b2f4..bac594b 100644 --- a/environment.nix +++ b/environment.nix @@ -24,27 +24,26 @@ # SDDM_INITIAL_VT = "VT 7"; # https://discourse.nixos.org/t/guide-to-installing-qt-theme/35523/3 - XDG_CURRENT_DESKTOP = "Hyprland"; - XDG_SESSION_DESKTOP = "Hyprland"; - XDG_SESSION_TYPE = "wayland"; + # XDG_CURRENT_DESKTOP = "Hyprland"; + # XDG_SESSION_DESKTOP = "Hyprland"; + # XDG_SESSION_TYPE = "wayland"; XDG_DATA_HOME = "~/.local/share"; # GDK_BACKEND = "wayland"; GTK_USE_PORTAL = "1"; # QT_QPA_PLATFORM = "wayland;xcb"; - QT_QPA_PLATFORM = "wayland"; - QT_QPA_PLATFORMTHEME = "qt5ct"; - # QT_STYLE_OVERRIDE = "cleanlooks"; - QT_WAYLAND_DISABLE_WINDOWDECORATION = "1"; - QT_AUTO_SCREEN_SCALE_FACTOR = "1"; + # QT_QPA_PLATFORM = "wayland"; + # QT_QPA_PLATFORMTHEME = "qt5ct"; + # # QT_STYLE_OVERRIDE = "cleanlooks"; + # QT_WAYLAND_DISABLE_WINDOWDECORATION = "1"; + # QT_AUTO_SCREEN_SCALE_FACTOR = "1"; QT_SCALE_FACTOR_ROUNDING_POLICY = "RoundPreferFloor"; - MOZ_ENABLE_WAYLAND = "1"; + # MOZ_ENABLE_WAYLAND = "1"; - OZONE_PLATFORM = "wayland"; - ELECTRON_OZONE_PLATFORM_HINT = "wayland"; + # ELECTRON_OZONE_PLATFORM_HINT = "wayland"; APPIMAGELAUNCHER_DISABLE ="1"; - CLUTTER_BACKEND = "wayland"; + # CLUTTER_BACKEND = "wayland"; SDL_VIDEODRIVER = "wayland"; LIBVA_DRIVER_NAME = "nvidia"; diff --git a/flake.lock b/flake.lock index 7414b4c..74750ee 100644 --- a/flake.lock +++ b/flake.lock @@ -189,11 +189,11 @@ ] }, "locked": { - "lastModified": 1744117652, - "narHash": "sha256-t7dFCDl4vIOOUMhEZnJF15aAzkpaup9x4ZRGToDFYWI=", + "lastModified": 1746171682, + "narHash": 
"sha256-EyXUNSa+H+YvGVuQJP1nZskXAowxKYp79RNUsNdQTj4=", "owner": "nix-community", "repo": "home-manager", - "rev": "b4e98224ad1336751a2ac7493967a4c9f6d9cb3f", + "rev": "50eee705bbdbac942074a8c120e8194185633675", "type": "github" }, "original": { @@ -441,11 +441,11 @@ ] }, "locked": { - "lastModified": 1743912348, - "narHash": "sha256-Ui+kUYOhkQ0wkoJbCPLN/LpYQGp+R5sNVsawZWZsEcQ=", + "lastModified": 1745121923, + "narHash": "sha256-8X9JuDfxAEQlBhB0ARgFj9fbDOlCvPx6AbQ1h2T47/g=", "owner": "youwen5", "repo": "zen-browser-flake", - "rev": "75acddf6a50f3a4dd920a3a7220828605ba91090", + "rev": "02084a38e9dbc4fa17f3474c3e9d43bb7db55799", "type": "github" }, "original": { diff --git a/flakes/tensorflow/events.out.tfevents.1745612293.launchpad.1090060.0.v2 b/flakes/tensorflow/events.out.tfevents.1745612293.launchpad.1090060.0.v2 new file mode 100644 index 0000000..1bff34c Binary files /dev/null and b/flakes/tensorflow/events.out.tfevents.1745612293.launchpad.1090060.0.v2 differ diff --git a/flakes/tensorflow/logs.log b/flakes/tensorflow/logs.log new file mode 100644 index 0000000..1684c9c --- /dev/null +++ b/flakes/tensorflow/logs.log @@ -0,0 +1,67361 @@ +Nov 29 11:13:56 launchpad systemd[1]: Stopping Server for local large language models... +Nov 29 11:13:56 launchpad systemd[1]: ollama.service: Deactivated successfully. +Nov 29 11:13:56 launchpad systemd[1]: Stopped Server for local large language models. +Nov 29 11:13:56 launchpad systemd[1]: ollama.service: Consumed 5.334s CPU time, no IP traffic. +Nov 29 11:13:56 launchpad systemd[1]: Started Server for local large language models. 
+Nov 29 11:13:56 launchpad ollama[549315]: 2024/11/29 11:13:56 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]" +Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.805-08:00 level=INFO source=images.go:704 msg="total blobs: 18" +Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0" +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached. +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production. 
+Nov 29 11:13:56 launchpad ollama[549315]: - using env: export GIN_MODE=release +Nov 29 11:13:56 launchpad ollama[549315]: - using code: gin.SetMode(gin.ReleaseMode) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/pull --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/generate --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/chat --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/embeddings --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/create --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/push --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/copy --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] DELETE /api/delete --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/show --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET /api/ps --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers) +Nov 29 11:13:56 
launchpad ollama[549315]: [GIN-debug] POST /v1/chat/completions --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)" +Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama4178292014/runners +Nov 29 11:14:00 launchpad ollama[549315]: time=2024-11-29T11:14:00.642-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]" +Nov 29 11:14:00 launchpad ollama[549315]: time=2024-11-29T11:14:00.711-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="10.3 GiB" +Nov 29 11:17:00 launchpad systemd[1]: Stopping Server for local large language models... +Nov 29 11:17:00 launchpad systemd[1]: ollama.service: Deactivated successfully. 
+Nov 29 11:17:00 launchpad systemd[1]: Stopped Server for local large language models. +Nov 29 11:17:00 launchpad systemd[1]: ollama.service: Consumed 5.182s CPU time, no IP traffic. +-- Boot 47c23a229c4c4528a9dcd93e2546681d -- +Nov 29 11:18:42 launchpad systemd[1]: Started Server for local large language models. +Nov 29 11:18:42 launchpad ollama[1650]: 2024/11/29 11:18:42 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]" +Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.238-08:00 level=INFO source=images.go:704 msg="total blobs: 18" +Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.243-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0" +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached. +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production. 
+Nov 29 11:18:42 launchpad ollama[1650]: - using env: export GIN_MODE=release +Nov 29 11:18:42 launchpad ollama[1650]: - using code: gin.SetMode(gin.ReleaseMode) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/pull --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/generate --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/chat --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/embeddings --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/create --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/push --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/copy --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] DELETE /api/delete --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/show --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET /api/ps --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: 
[GIN-debug] POST /v1/chat/completions --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.244-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)" +Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.244-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama232997375/runners +Nov 29 11:18:46 launchpad ollama[1650]: time=2024-11-29T11:18:46.244-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]" +Nov 29 11:18:46 launchpad ollama[1650]: time=2024-11-29T11:18:46.322-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB" +Nov 29 15:43:43 launchpad systemd[1]: Stopping Server for local large language models... +Nov 29 15:43:43 launchpad systemd[1]: ollama.service: Deactivated successfully. 
+Nov 29 15:43:43 launchpad systemd[1]: Stopped Server for local large language models. +Nov 29 15:43:43 launchpad systemd[1]: ollama.service: Consumed 5.606s CPU time, 1.1G memory peak, 0B memory swap peak, no IP traffic. +-- Boot f721c7d615d94dafb053feaf5ec77243 -- +Nov 29 15:44:29 launchpad systemd[1]: Started Server for local large language models. +Nov 29 15:44:29 launchpad ollama[1636]: 2024/11/29 15:44:29 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]" +Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.739-08:00 level=INFO source=images.go:704 msg="total blobs: 18" +Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.745-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0" +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached. +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production. 
+Nov 29 15:44:29 launchpad ollama[1636]: - using env: export GIN_MODE=release +Nov 29 15:44:29 launchpad ollama[1636]: - using code: gin.SetMode(gin.ReleaseMode) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/pull --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/generate --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/chat --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/embeddings --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/create --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/push --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/copy --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] DELETE /api/delete --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/show --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD /api/blobs/:digest --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET /api/ps --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: 
[GIN-debug] POST /v1/chat/completions --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD / --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD /api/tags --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD /api/version --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers) +Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.745-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)" +Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.746-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama3406801642/runners +Nov 29 15:44:33 launchpad ollama[1636]: time=2024-11-29T15:44:33.917-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cuda_v12 cpu cpu_avx cpu_avx2]" +Nov 29 15:44:34 launchpad ollama[1636]: time=2024-11-29T15:44:34.008-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB" +Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 | 877.976µs | 127.0.0.1 | HEAD "/" +Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 | 2.076826ms | 127.0.0.1 | POST "/api/show" 
+Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 | 303.403µs | 127.0.0.1 | POST "/api/show" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.808-08:00 level=WARN source=types.go:384 msg="invalid option provided" option="" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.930-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.930-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama3406801642/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable 
--n-gpu-layers 35 --parallel 1 --port 46225" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1 +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding" +Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.932-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error" +Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] build info | build=0 commit="unknown" tid="140289154994176" timestamp=1733079149 +Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140289154994176" timestamp=1733079149 total_threads=16 +Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46225" tid="140289154994176" timestamp=1733079149 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2) +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 0: general.architecture str = llama +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 1: general.name str = codellama +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 2: llama.context_length u32 = 16384 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 3: llama.embedding_length u32 = 5120 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 4: llama.block_count u32 = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 5: llama.feed_forward_length u32 = 13824 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 7: llama.attention.head_count u32 = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 11: general.file_type u32 = 2 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 12: tokenizer.ggml.model str = llama +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32016] = ["", "", "", "<0x00>", "<... +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32016] = [0.000000, 0.000000, 0.000000, 0.0000... +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32016] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... 
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv 19: general.quantization_version u32 = 2 +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type f32: 81 tensors +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type q4_0: 281 tensors +Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type q6_K: 1 tensors +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_vocab: mismatch in special tokens definition ( 264/32016 vs 259/32016 ). +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: format = GGUF V2 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: arch = llama +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: vocab type = SPM +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_vocab = 32016 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_merges = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_ctx_train = 16384 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd = 5120 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_head = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_head_kv = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_layer = 40 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_rot = 128 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_head_k = 128 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_head_v = 128 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_gqa = 1 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_k_gqa = 5120 +Dec 01 10:52:29 launchpad 
ollama[1636]: llm_load_print_meta: n_embd_v_gqa = 5120 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_norm_eps = 0.0e+00 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_norm_rms_eps = 1.0e-05 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_clamp_kqv = 0.0e+00 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_logit_scale = 0.0e+00 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_ff = 13824 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_expert = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_expert_used = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: causal attn = 1 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: pooling type = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope type = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope scaling = linear +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: freq_base_train = 1000000.0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: freq_scale_train = 1 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_yarn_orig_ctx = 16384 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope_finetuned = unknown +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_conv = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_inner = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_state = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_dt_rank = 0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model type = 13B +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model ftype = Q4_0 +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model params = 13.02 B +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: 
model size = 6.86 GiB (4.53 BPW) +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: general.name = codellama +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: BOS token = 1 '' +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: EOS token = 2 '' +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: UNK token = 0 '' +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: LF token = 13 '<0x0A>' +Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: PRE token = 32007 '▁
'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: found 1 CUDA devices:
+Dec 01 10:52:29 launchpad ollama[1636]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 01 10:52:30 launchpad ollama[1636]: llm_load_tensors: ggml ctx size =    0.37 MiB
+Dec 01 10:52:30 launchpad ollama[1636]: time=2024-12-01T10:52:30.182-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors: offloading 35 repeating layers to GPU
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors: offloaded 35/41 layers to GPU
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors:      CUDA0 buffer size =  5956.84 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: ...................................................................................................
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_ctx      = 2048
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_batch    = 512
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_ubatch   = 512
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: freq_scale = 1
+Dec 01 10:52:37 launchpad ollama[1636]: llama_kv_cache_init:  CUDA_Host KV buffer size =   200.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_kv_cache_init:      CUDA0 KV buffer size =  1400.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: graph nodes  = 1286
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: graph splits = 59
+Dec 01 10:52:38 launchpad ollama[689882]: INFO [main] model loaded | tid="140289154994176" timestamp=1733079158
+Dec 01 10:52:38 launchpad ollama[1636]: time=2024-12-01T10:52:38.205-08:00 level=INFO source=server.go:545 msg="llama runner started in 8.27 seconds"
+Dec 01 10:52:38 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:38 | 200 |  8.397271488s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 10:56:18 launchpad ollama[1636]: time=2024-12-01T10:56:18.865-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 10:56:43 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:56:43 | 200 | 25.105856746s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 10:57:46 launchpad ollama[1636]: time=2024-12-01T10:57:46.978-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 11:02:36 launchpad ollama[1636]: [GIN] 2024/12/01 - 11:02:36 | 200 |         4m49s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 11:04:38 launchpad ollama[1636]: time=2024-12-01T11:04:38.157-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 11:05:08 launchpad ollama[1636]: [GIN] 2024/12/01 - 11:05:08 | 200 | 29.954345658s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 18:58:47 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 01 18:58:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 01 18:58:47 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 01 18:58:47 launchpad systemd[1]: ollama.service: Consumed 30min 11.253s CPU time, 8.4G memory peak, 0B memory swap peak, received 1.1M IP traffic, sent 1.8M IP traffic.
+-- Boot 8b6024c7dfc34d2e950bf41d40c128b0 --
+Dec 02 07:06:20 launchpad systemd[1]: Started Server for local large language models.
+Dec 02 07:06:20 launchpad ollama[1681]: 2024/12/02 07:06:20 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.241-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.247-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 02 07:06:20 launchpad ollama[1681]:  - using env:        export GIN_MODE=release
+Dec 02 07:06:20 launchpad ollama[1681]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.247-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.248-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama117859918/runners
+Dec 02 07:06:24 launchpad ollama[1681]: time=2024-12-02T07:06:24.349-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 02 07:06:24 launchpad ollama[1681]: time=2024-12-02T07:06:24.437-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 03 10:46:13 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 03 10:46:13 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 03 10:46:13 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 03 10:46:13 launchpad systemd[1]: ollama.service: Consumed 5.926s CPU time, 1.1G memory peak, 0B memory swap peak, no IP traffic.
+-- Boot 3c3403eb7cf049c0ba9e3cae99d71216 --
+Dec 03 10:47:01 launchpad systemd[1]: Started Server for local large language models.
+Dec 03 10:47:01 launchpad ollama[1659]: 2024/12/03 10:47:01 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.471-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 03 10:47:01 launchpad ollama[1659]:  - using env:        export GIN_MODE=release
+Dec 03 10:47:01 launchpad ollama[1659]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama2477072148/runners
+Dec 03 10:47:05 launchpad ollama[1659]: time=2024-12-03T10:47:05.404-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx cpu_avx2 cuda_v12 cpu]"
+Dec 03 10:47:05 launchpad ollama[1659]: time=2024-12-03T10:47:05.493-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |     719.091µs |       127.0.0.1 | HEAD     "/"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |    1.573258ms |       127.0.0.1 | POST     "/api/show"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |     340.475µs |       127.0.0.1 | POST     "/api/show"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.661-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.661-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43623"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] build info | build=0 commit="unknown" tid="140027875844096" timestamp=1733262268
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140027875844096" timestamp=1733262268 total_threads=16
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43623" tid="140027875844096" timestamp=1733262268
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.913-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 13:44:28 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 13:44:29 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: .......................................................................................
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 13:44:34 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 13:44:34 launchpad ollama[108055]: INFO [main] model loaded | tid="140027875844096" timestamp=1733262274
+Dec 03 13:44:34 launchpad ollama[1659]: time=2024-12-03T13:44:34.683-08:00 level=INFO source=server.go:545 msg="llama runner started in 6.02 seconds"
+Dec 03 13:44:34 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:34 | 200 |  6.598303492s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 13:44:59 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:59 | 200 |  8.241165407s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.325-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36067"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.328-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] build info | build=0 commit="unknown" tid="140501364985856" timestamp=1733263189
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140501364985856" timestamp=1733263189 total_threads=16
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36067" tid="140501364985856" timestamp=1733263189
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.579-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 13:59:49 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: .......................................................................................
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 13:59:50 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 13:59:50 launchpad ollama[116260]: INFO [main] model loaded | tid="140501364985856" timestamp=1733263190
+Dec 03 13:59:50 launchpad ollama[1659]: time=2024-12-03T13:59:50.332-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.01 seconds"
+Dec 03 13:59:55 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:59:55 | 200 |  6.628493344s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.779-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.779-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34129"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] build info | build=0 commit="unknown" tid="139942337175552" timestamp=1733263913
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139942337175552" timestamp=1733263913 total_threads=16
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34129" tid="139942337175552" timestamp=1733263913
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:11:54 launchpad ollama[1659]: time=2024-12-03T14:11:54.031-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:11:54 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:11:54 launchpad ollama[123232]: INFO [main] model loaded | tid="139942337175552" timestamp=1733263914
+Dec 03 14:11:54 launchpad ollama[1659]: time=2024-12-03T14:11:54.784-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:12:03 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:12:03 | 200 | 10.801991113s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:16:25 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:16:25 | 200 | 10.381062652s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.088-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.088-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37643"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] build info | build=0 commit="unknown" tid="140408150876160" timestamp=1733265498
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140408150876160" timestamp=1733265498 total_threads=16
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37643" tid="140408150876160" timestamp=1733265498
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.340-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:38:18 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:38:19 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:38:19 launchpad ollama[137351]: INFO [main] model loaded | tid="140408150876160" timestamp=1733265499
+Dec 03 14:38:19 launchpad ollama[1659]: time=2024-12-03T14:38:19.093-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:38:23 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:38:23 | 200 |  5.665344191s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 45641"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] build info | build=0 commit="unknown" tid="140122244763648" timestamp=1733266571
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122244763648" timestamp=1733266571 total_threads=16
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45641" tid="140122244763648" timestamp=1733266571
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.883-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:56:11 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:56:12 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:56:12 launchpad ollama[146881]: INFO [main] model loaded | tid="140122244763648" timestamp=1733266572
+Dec 03 14:56:12 launchpad ollama[1659]: time=2024-12-03T14:56:12.635-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:56:16 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:56:16 | 200 |  5.941380448s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:58:25 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:58:25 | 200 |  4.217491423s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:59:56 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:59:56 | 200 |  2.892466931s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:03:18 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:03:18 | 200 |  4.368670418s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:04:50 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:04:50 | 200 |  3.542498449s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:06:08 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:06:08 | 200 |  3.802530471s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:07:06 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:07:06 | 200 |  2.983658913s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44625"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] build info | build=0 commit="unknown" tid="140519775850496" timestamp=1733267573
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140519775850496" timestamp=1733267573 total_threads=16
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44625" tid="140519775850496" timestamp=1733267573
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.816-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 15:12:53 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 15:12:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 15:12:54 launchpad ollama[155772]: INFO [main] model loaded | tid="140519775850496" timestamp=1733267574
+Dec 03 15:12:54 launchpad ollama[1659]: time=2024-12-03T15:12:54.570-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 15:12:58 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:12:58 | 200 |  5.654989835s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44595"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] build info | build=0 commit="unknown" tid="140205527814144" timestamp=1733268113
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140205527814144" timestamp=1733268113 total_threads=16
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44595" tid="140205527814144" timestamp=1733268113
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.445-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 15:21:53 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 15:21:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 15:21:54 launchpad ollama[160851]: INFO [main] model loaded | tid="140205527814144" timestamp=1733268114
+Dec 03 15:21:54 launchpad ollama[1659]: time=2024-12-03T15:21:54.450-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.26 seconds"
+Dec 03 15:22:01 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:22:01 | 200 |  8.782713142s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:23:21 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:23:21 | 200 |  4.122703789s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:25:15 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:25:15 | 200 |  5.794661393s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:28:42 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:28:42 | 200 |   4.39913915s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:30:46 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:30:46 | 200 |  6.549918347s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:31:46 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:31:46 | 200 |   6.76762143s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:34:34 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:34:34 | 200 |  7.400947696s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:36:00 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:36:00 | 200 |  7.297590238s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:37:59 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:37:59 | 200 |  6.233302043s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:38:47 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:38:47 | 200 |  5.540288603s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:39:30 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:39:30 | 200 |  7.378510802s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:42:35 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:42:35 | 200 |  8.335320048s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38289"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] build info | build=0 commit="unknown" tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139686128603136" timestamp=1733270987 total_threads=16
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38289" tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.258-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 16:09:47 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: .......................................................................................
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] model loaded | tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:48 launchpad ollama[1659]: time=2024-12-03T16:09:48.011-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 16:09:57 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:09:57 | 200 | 10.877322797s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:14:11 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:14:11 | 200 |  7.633343909s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:15:03 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:15:03 | 200 |  6.904140221s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:16:40 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:16:40 | 200 |  7.851581347s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:18:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:18:28 | 200 |   8.12679671s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:21:31 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:21:31 | 200 |  6.485433807s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:23:21 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:23:21 | 200 |  5.711659211s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:26:13 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:26:13 | 200 |   6.95786815s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:28:17 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:28:17 | 200 |  7.713706455s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:29:52 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:29:52 | 200 |  8.731658639s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:30:53 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:30:53 | 200 |   6.95133823s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |       16.01µs |       127.0.0.1 | HEAD     "/"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |     358.412µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |      327.29µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.385-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.0 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.0 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 42781"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] build info | build=0 commit="unknown" tid="140617243279360" timestamp=1733332621
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140617243279360" timestamp=1733332621 total_threads=16
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42781" tid="140617243279360" timestamp=1733332621
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.638-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 09:17:01 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: .......................................................................................
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 09:17:02 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 09:17:02 launchpad ollama[292482]: INFO [main] model loaded | tid="140617243279360" timestamp=1733332622
+Dec 04 09:17:02 launchpad ollama[1659]: time=2024-12-04T09:17:02.392-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.01 seconds"
+Dec 04 09:17:02 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:02 | 200 |  1.593119474s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:17:25 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:25 | 200 |  3.968985607s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:18:29 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:18:29 | 200 |  1.737632457s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:19:26 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:19:26 | 200 |  2.742525699s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:20:37 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:20:37 | 200 |  4.027589705s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:22:08 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:22:08 | 200 |  4.770886706s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:23:16 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:23:16 | 200 |  4.342790103s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:24:19 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:24:19 | 200 |  5.521583489s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:25:48 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:25:48 | 200 |  5.651255584s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:28:28 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:28:28 | 200 |  3.200879754s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:29:17 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:29:17 | 200 |  4.444700133s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:29:45 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:29:45 | 200 |  6.600032671s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:31:35 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:31:35 | 200 |  6.134691517s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |      15.281µs |       127.0.0.1 | HEAD     "/"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     302.976µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     233.124µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     536.018µs |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:34:43 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:34:43 | 200 |  720.970832ms |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:35:29 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:35:29 | 200 |  2.584235748s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:37:22 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:37:22 | 200 |  4.808000486s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:38:52 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:38:52 | 200 |  3.041747597s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:40:35 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:40:35 | 200 |  7.007546328s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:42:21 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:42:21 | 200 |  9.182585003s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:47:21 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:47:21 | 200 |  8.269648658s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:51:02 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:51:02 | 200 |  7.064598792s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:54:37 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:54:37 | 200 |  7.891324419s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.178-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.178-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44941"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] build info | build=0 commit="unknown" tid="140313003261952" timestamp=1733335931
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140313003261952" timestamp=1733335931 total_threads=16
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44941" tid="140313003261952" timestamp=1733335931
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.431-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:12:11 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:12:12 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:12:12 launchpad ollama[322719]: INFO [main] model loaded | tid="140313003261952" timestamp=1733335932
+Dec 04 10:12:12 launchpad ollama[1659]: time=2024-12-04T10:12:12.184-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:12:24 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:12:24 | 200 | 14.291565705s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.616-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40401"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] build info | build=0 commit="unknown" tid="139769284988928" timestamp=1733336649
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139769284988928" timestamp=1733336649 total_threads=16
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40401" tid="139769284988928" timestamp=1733336649
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.868-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:24:09 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:24:10 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:24:10 launchpad ollama[329081]: INFO [main] model loaded | tid="139769284988928" timestamp=1733336650
+Dec 04 10:24:10 launchpad ollama[1659]: time=2024-12-04T10:24:10.621-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:24:19 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:24:19 | 200 | 10.579786736s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:25:48 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:25:48 | 200 |   9.70843195s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.125-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.125-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43823"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] build info | build=0 commit="unknown" tid="140683282182144" timestamp=1733337649
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140683282182144" timestamp=1733337649 total_threads=16
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43823" tid="140683282182144" timestamp=1733337649
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.377-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:40:49 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:40:50 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:40:50 launchpad ollama[337927]: INFO [main] model loaded | tid="140683282182144" timestamp=1733337650
+Dec 04 10:40:50 launchpad ollama[1659]: time=2024-12-04T10:40:50.129-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:40:59 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:40:59 | 200 | 11.099586731s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46479"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] build info | build=0 commit="unknown" tid="139637822832640" timestamp=1733338269
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139637822832640" timestamp=1733338269 total_threads=16
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46479" tid="139637822832640" timestamp=1733338269
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:51:10 launchpad ollama[1659]: time=2024-12-04T10:51:10.162-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:51:10 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:51:10 launchpad ollama[343427]: INFO [main] model loaded | tid="139637822832640" timestamp=1733338270
+Dec 04 10:51:10 launchpad ollama[1659]: time=2024-12-04T10:51:10.914-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:51:18 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:51:18 | 200 |  9.459881992s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:54:55 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:54:55 | 200 |  4.748420486s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.219-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37867"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] build info | build=0 commit="unknown" tid="140276627443712" timestamp=1733338867
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140276627443712" timestamp=1733338867 total_threads=16
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37867" tid="140276627443712" timestamp=1733338867
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.471-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 11:01:07 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: .......................................................................................
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 11:01:08 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 11:01:08 launchpad ollama[348732]: INFO [main] model loaded | tid="140276627443712" timestamp=1733338868
+Dec 04 11:01:08 launchpad ollama[1659]: time=2024-12-04T11:01:08.475-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 04 11:01:16 launchpad ollama[1659]: [GIN] 2024/12/04 - 11:01:16 | 200 |  9.608117009s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 11:05:40 launchpad ollama[1659]: [GIN] 2024/12/04 - 11:05:40 | 200 |  6.492649587s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 13:06:01 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 04 13:06:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 04 13:06:01 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 04 13:06:01 launchpad systemd[1]: ollama.service: Consumed 6min 51.007s CPU time, received 7.9M IP traffic, sent 8.6M IP traffic.
+-- Boot 8503f579953f4577b468cd97d1b57e4f --
+Dec 04 13:06:44 launchpad systemd[1]: Started Server for local large language models.
+Dec 04 13:06:44 launchpad ollama[1687]: 2024/12/04 13:06:44 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.815-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.820-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 04 13:06:44 launchpad ollama[1687]:  - using env:        export GIN_MODE=release
+Dec 04 13:06:44 launchpad ollama[1687]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.820-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.821-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama2609597904/runners
+Dec 04 13:06:48 launchpad ollama[1687]: time=2024-12-04T13:06:48.842-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 04 13:06:48 launchpad ollama[1687]: time=2024-12-04T13:06:48.930-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |    1.224376ms |       127.0.0.1 | HEAD     "/"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |    2.013787ms |       127.0.0.1 | POST     "/api/show"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |     247.916µs |       127.0.0.1 | POST     "/api/show"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.625-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.625-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43769"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] build info | build=0 commit="unknown" tid="140035587289088" timestamp=1733414661
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140035587289088" timestamp=1733414661 total_threads=16
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43769" tid="140035587289088" timestamp=1733414661
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.877-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:04:21 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:04:27 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:04:27 launchpad ollama[177289]: INFO [main] model loaded | tid="140035587289088" timestamp=1733414667
+Dec 05 08:04:27 launchpad ollama[1687]: time=2024-12-05T08:04:27.394-08:00 level=INFO source=server.go:545 msg="llama runner started in 5.77 seconds"
+Dec 05 08:04:27 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:27 | 200 |  6.345604465s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:06:18 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:06:18 | 200 | 12.439577585s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:10:26 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:10:26 | 200 |  3.813542223s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36315"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.218-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.218-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] build info | build=0 commit="unknown" tid="140490451046400" timestamp=1733415674
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140490451046400" timestamp=1733415674 total_threads=16
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36315" tid="140490451046400" timestamp=1733415674
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.469-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:21:14 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:21:15 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:21:15 launchpad ollama[186406]: INFO [main] model loaded | tid="140490451046400" timestamp=1733415675
+Dec 05 08:21:15 launchpad ollama[1687]: time=2024-12-05T08:21:15.222-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 08:21:27 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:21:27 | 200 | 14.264161715s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41411"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.026-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] build info | build=0 commit="unknown" tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139938871152640" timestamp=1733417962 total_threads=16
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41411" tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.276-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:59:22 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] model loaded | tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:23 launchpad ollama[1687]: time=2024-12-05T08:59:23.030-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 08:59:33 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:59:33 | 200 | 12.304547927s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.563-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.564-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38489"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] build info | build=0 commit="unknown" tid="139747822436352" timestamp=1733419381
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139747822436352" timestamp=1733419381 total_threads=16
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38489" tid="139747822436352" timestamp=1733419381
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.815-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 09:23:01 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: .......................................................................................
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 09:23:02 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 09:23:02 launchpad ollama[219369]: INFO [main] model loaded | tid="139747822436352" timestamp=1733419382
+Dec 05 09:23:02 launchpad ollama[1687]: time=2024-12-05T09:23:02.568-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 09:23:14 launchpad ollama[1687]: [GIN] 2024/12/05 - 09:23:14 | 200 | 13.574240459s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.488-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.5 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.488-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.5 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34215"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] build info | build=0 commit="unknown" tid="140586951700480" timestamp=1733448344
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140586951700480" timestamp=1733448344 total_threads=16
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34215" tid="140586951700480" timestamp=1733448344
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.740-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:25:44 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:25:45 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:25:45 launchpad ollama[1609159]: INFO [main] model loaded | tid="140586951700480" timestamp=1733448345
+Dec 05 17:25:45 launchpad ollama[1687]: time=2024-12-05T17:25:45.493-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 17:25:57 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:25:57 | 200 | 13.478277071s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:27:55 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:27:55 | 200 | 10.078664984s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.723-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.723-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46521"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] build info | build=0 commit="unknown" tid="140303987441664" timestamp=1733449031
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140303987441664" timestamp=1733449031 total_threads=16
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46521" tid="140303987441664" timestamp=1733449031
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.975-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:37:12 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:37:12 launchpad ollama[1615259]: INFO [main] model loaded | tid="140303987441664" timestamp=1733449032
+Dec 05 17:37:12 launchpad ollama[1687]: time=2024-12-05T17:37:12.978-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 05 17:37:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:37:21 | 200 | 10.112585412s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.243-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.243-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 45891"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] build info | build=0 commit="unknown" tid="139890847899648" timestamp=1733450231
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139890847899648" timestamp=1733450231 total_threads=16
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45891" tid="139890847899648" timestamp=1733450231
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.495-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:57:11 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:57:12 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:57:12 launchpad ollama[1625869]: INFO [main] model loaded | tid="139890847899648" timestamp=1733450232
+Dec 05 17:57:12 launchpad ollama[1687]: time=2024-12-05T17:57:12.498-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 05 17:57:22 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:57:22 | 200 | 11.885280432s |       127.0.0.1 | POST     "/api/chat"
+Dec 06 08:19:51 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 08:19:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 08:19:51 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 08:19:51 launchpad systemd[1]: ollama.service: Consumed 1min 56.861s CPU time, received 1.3M IP traffic, sent 1.8M IP traffic.
+-- Boot b06d70c38d9d47bab758fba931dbb670 --
+Dec 06 08:21:52 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 08:21:53 launchpad ollama[1676]: 2024/12/06 08:21:53 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.083-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.087-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 06 08:21:53 launchpad ollama[1676]:  - using env:        export GIN_MODE=release
+Dec 06 08:21:53 launchpad ollama[1676]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.088-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.089-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama1489032399/runners
+Dec 06 08:21:57 launchpad ollama[1676]: time=2024-12-06T08:21:57.009-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 08:21:57 launchpad ollama[1676]: time=2024-12-06T08:21:57.095-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 06 09:51:44 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 09:51:44 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 09:51:44 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 09:51:44 launchpad systemd[1]: ollama.service: Consumed 5.379s CPU time, no IP traffic.
+Dec 06 09:51:49 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 09:51:49 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 09:51:49 launchpad ollama[629744]: 2024/12/06 09:51:49 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2737807184/runners
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.783-08:00 level=INFO source=gpu.go:568 msg="unable to load cuda driver library" library=/nix/store/6681919bklbwc4jadjrvq9an1ackx05g-nvidia-x11-560.35.03-6.6.30-rt30/lib/libcuda.so.560.35.03 error="cuda driver library init failure: 804"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.783-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.788-08:00 level=INFO source=gpu.go:347 msg="no compatible GPUs were discovered"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.788-08:00 level=INFO source=types.go:107 msg="inference compute" id=0 library=cpu variant=avx2 compute="" driver=0.0 name="" total="62.6 GiB" available="57.5 GiB"
+Dec 06 10:05:01 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 10:05:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 10:05:01 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 10:05:01 launchpad systemd[1]: ollama.service: Consumed 3.363s CPU time, 541M memory peak, 508.1M written to disk.
+-- Boot edbe8cb00f2a4fa7a88446dbef5420da --
+Dec 06 10:06:26 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 10:06:26 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 10:06:26 launchpad ollama[1572]: 2024/12/06 10:06:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.415-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.421-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.423-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.425-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3167090512/runners
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.422-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.422-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.626-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 06 12:45:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 12:45:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 12:45:59 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 12:45:59 launchpad systemd[1]: ollama.service: Consumed 3.581s CPU time, 791.5M memory peak, 234.8M read from disk, 508.1M written to disk.
+-- Boot b9ef6a62028d4f349af64a0e5416b561 --
+Dec 06 12:46:32 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 12:46:32 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 12:46:32 launchpad ollama[1573]: 2024/12/06 12:46:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.719-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.723-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.724-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.727-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1560843028/runners
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.709-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.924-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 06 19:00:22 launchpad systemd[1]: Stopping Server for local large language models...
+-- Boot 65b3437b4fd147ec925a7e6bebfc3127 --
+Dec 07 08:29:25 launchpad systemd[1]: Starting Server for local large language models...
+Dec 07 08:29:25 launchpad systemd[1]: Started Server for local large language models.
+Dec 07 08:29:25 launchpad ollama[1648]: 2024/12/07 08:29:25 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.305-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.309-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.311-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.312-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama991934907/runners
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.536-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Dec 07 08:30:35 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 07 08:30:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 07 08:30:35 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 07 08:30:35 launchpad systemd[1]: ollama.service: Consumed 3.483s CPU time, 790.7M memory peak, 234.7M read from disk, 508.1M written to disk.
+-- Boot ce51437370b3419eabbfd1159e7325bc --
+Dec 07 08:31:07 launchpad systemd[1]: Starting Server for local large language models...
+Dec 07 08:31:07 launchpad systemd[1]: Started Server for local large language models.
+Dec 07 08:31:07 launchpad ollama[1650]: 2024/12/07 08:31:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.843-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.847-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.848-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.850-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1292331900/runners
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.841-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.841-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:11 launchpad ollama[1650]: time=2024-12-07T08:31:11.049-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 07 16:12:35 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:35 | 200 |     621.541µs |       127.0.0.1 | HEAD     "/"
+Dec 07 16:12:35 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:35 | 200 |    6.484323ms |       127.0.0.1 | POST     "/api/show"
+Dec 07 16:12:35 launchpad ollama[1650]: time=2024-12-07T16:12:35.985-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10494410752 required="9.2 GiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38187"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.122-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] build info | build=0 commit="unknown" tid="139904028643328" timestamp=1733616756
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139904028643328" timestamp=1733616756 total_threads=16
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38187" tid="139904028643328" timestamp=1733616756
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 16:12:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.373-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 16:12:44 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 16:12:44 launchpad ollama[45616]: INFO [main] model loaded | tid="139904028643328" timestamp=1733616764
+Dec 07 16:12:44 launchpad ollama[1650]: time=2024-12-07T16:12:44.901-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 07 16:12:44 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:44 | 200 |  8.919257897s |       127.0.0.1 | POST     "/api/generate"
+Dec 07 16:13:06 launchpad ollama[1650]: time=2024-12-07T16:13:06.296-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:13:09 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:13:09 | 200 |  2.879538725s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:14:20 launchpad ollama[1650]: time=2024-12-07T16:14:20.387-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:14:22 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:14:22 | 200 |  2.187351474s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:14:40 launchpad ollama[1650]: time=2024-12-07T16:14:40.007-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:14:43 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:14:43 | 200 |  3.754995679s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:15:53 launchpad ollama[1650]: time=2024-12-07T16:15:53.607-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:06 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:06 | 200 | 12.567518815s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:16:23 launchpad ollama[1650]: time=2024-12-07T16:16:23.127-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:27 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:27 | 200 |  4.511815046s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:16:41 launchpad ollama[1650]: time=2024-12-07T16:16:41.915-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:58 | 200 | 16.926235698s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:17:26 launchpad ollama[1650]: time=2024-12-07T16:17:26.849-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:17:29 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:17:29 | 200 |  2.208857201s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:17:39 launchpad ollama[1650]: time=2024-12-07T16:17:39.180-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:17:47 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:17:47 | 200 |  7.896416154s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:07:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:58 | 200 |      14.163µs |       127.0.0.1 | HEAD     "/"
+Dec 07 18:07:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:58 | 200 |    3.429683ms |       127.0.0.1 | POST     "/api/show"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10334109696 required="9.2 GiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.5 GiB" free_swap="68.9 GiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 44783"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.680-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] build info | build=0 commit="unknown" tid="140077486399488" timestamp=1733623678
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140077486399488" timestamp=1733623678 total_threads=16
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44783" tid="140077486399488" timestamp=1733623678
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 18:07:58 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 18:07:59 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 18:07:59 launchpad ollama[50241]: INFO [main] model loaded | tid="140077486399488" timestamp=1733623679
+Dec 07 18:07:59 launchpad ollama[1650]: time=2024-12-07T18:07:59.974-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 07 18:07:59 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:59 | 200 |  1.427623749s |       127.0.0.1 | POST     "/api/generate"
+Dec 07 18:09:24 launchpad ollama[1650]: time=2024-12-07T18:09:24.306-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:09:33 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:09:33 | 200 |  9.630707279s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:10:12 launchpad ollama[1650]: time=2024-12-07T18:10:12.324-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:10:19 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:10:19 | 200 |  7.226828194s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.063-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.183-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10355539968 required="9.2 GiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.183-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.184-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.185-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45909"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] build info | build=0 commit="unknown" tid="140596541751296" timestamp=1733624120
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140596541751296" timestamp=1733624120 total_threads=16
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45909" tid="140596541751296" timestamp=1733624120
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 18:15:20 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.476-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 18:15:21 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 18:15:21 launchpad ollama[50643]: INFO [main] model loaded | tid="140596541751296" timestamp=1733624121
+Dec 07 18:15:21 launchpad ollama[1650]: time=2024-12-07T18:15:21.479-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 07 18:15:32 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:15:32 | 200 | 12.117083478s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.316-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.463-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.463-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.464-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 38879"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] build info | build=0 commit="unknown" tid="140455179128832" timestamp=1733676222
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140455179128832" timestamp=1733676222 total_threads=16
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38879" tid="140455179128832" timestamp=1733676222
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 08:43:42 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 08:43:43 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 08:43:43 launchpad ollama[57815]: INFO [main] model loaded | tid="140455179128832" timestamp=1733676223
+Dec 08 08:43:43 launchpad ollama[1650]: time=2024-12-08T08:43:43.755-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 08:44:04 launchpad ollama[1650]: [GIN] 2024/12/08 - 08:44:04 | 200 | 21.852098012s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.683-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.832-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.833-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 45015"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] build info | build=0 commit="unknown" tid="140066033774592" timestamp=1733676600
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140066033774592" timestamp=1733676600 total_threads=16
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45015" tid="140066033774592" timestamp=1733676600
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 08:50:00 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: time=2024-12-08T08:50:01.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 08:50:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 08:50:01 launchpad ollama[61638]: INFO [main] model loaded | tid="140066033774592" timestamp=1733676601
+Dec 08 08:50:02 launchpad ollama[1650]: time=2024-12-08T08:50:02.130-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 08:50:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 08:50:31 | 200 | 30.446080939s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.696-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.848-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.848-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 42045"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.850-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] build info | build=0 commit="unknown" tid="140673853370368" timestamp=1733678457
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140673853370368" timestamp=1733678457 total_threads=16
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42045" tid="140673853370368" timestamp=1733678457
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 09:20:57 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: time=2024-12-08T09:20:58.139-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 09:20:58 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 09:20:58 launchpad ollama[66994]: INFO [main] model loaded | tid="140673853370368" timestamp=1733678458
+Dec 08 09:20:59 launchpad ollama[1650]: time=2024-12-08T09:20:59.143-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 09:21:09 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:21:09 | 200 | 12.067512562s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.170-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.317-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.318-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 35631"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] build info | build=0 commit="unknown" tid="139818168209408" timestamp=1733679123
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139818168209408" timestamp=1733679123 total_threads=16
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35631" tid="139818168209408" timestamp=1733679123
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 09:32:03 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.609-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 09:32:04 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 09:32:04 launchpad ollama[68579]: INFO [main] model loaded | tid="139818168209408" timestamp=1733679124
+Dec 08 09:32:04 launchpad ollama[1650]: time=2024-12-08T09:32:04.613-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 09:32:13 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:32:13 | 200 |  9.844417818s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:36:13 launchpad ollama[1650]: time=2024-12-08T09:36:13.581-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:36:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:36:20 | 200 |  7.068664686s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:39:03 launchpad ollama[1650]: time=2024-12-08T09:39:03.802-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:39:16 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:39:16 | 200 | 12.658383022s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:40:45 launchpad ollama[1650]: time=2024-12-08T09:40:45.710-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:40:56 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:40:56 | 200 | 10.414743911s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:45:19 launchpad ollama[1650]: time=2024-12-08T09:45:19.928-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:45:34 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:45:34 | 200 | 14.751119658s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.649-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.793-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.793-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 34377"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] build info | build=0 commit="unknown" tid="140252596342784" timestamp=1733681627
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140252596342784" timestamp=1733681627 total_threads=16
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34377" tid="140252596342784" timestamp=1733681627
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:13:47 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: time=2024-12-08T10:13:48.088-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:13:48 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:13:48 launchpad ollama[77261]: INFO [main] model loaded | tid="140252596342784" timestamp=1733681628
+Dec 08 10:13:49 launchpad ollama[1650]: time=2024-12-08T10:13:49.092-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 10:14:06 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:14:06 | 200 | 18.756867855s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.234-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.384-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.384-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 43061"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] build info | build=0 commit="unknown" tid="140602753769472" timestamp=1733682000
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140602753769472" timestamp=1733682000 total_threads=16
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43061" tid="140602753769472" timestamp=1733682000
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:20:00 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.674-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:20:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:20:01 launchpad ollama[80016]: INFO [main] model loaded | tid="140602753769472" timestamp=1733682001
+Dec 08 10:20:01 launchpad ollama[1650]: time=2024-12-08T10:20:01.678-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 10:20:18 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:20:18 | 200 | 18.129676296s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.598-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.743-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.743-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 43375"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.745-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] build info | build=0 commit="unknown" tid="140292201209856" timestamp=1733682415
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140292201209856" timestamp=1733682415 total_threads=16
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43375" tid="140292201209856" timestamp=1733682415
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:26:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: time=2024-12-08T10:26:56.033-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:26:56 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:26:56 launchpad ollama[82681]: INFO [main] model loaded | tid="140292201209856" timestamp=1733682416
+Dec 08 10:26:57 launchpad ollama[1650]: time=2024-12-08T10:26:57.036-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 10:27:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:27:20 | 200 | 24.559503085s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:30:59 launchpad ollama[1650]: time=2024-12-08T10:30:59.211-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:31:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:31:07 | 200 |  8.074165886s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:34:21 launchpad ollama[1650]: time=2024-12-08T10:34:21.741-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:34:39 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:34:39 | 200 | 17.802571239s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.456-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.604-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.9 GiB" free_swap="68.9 GiB"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.605-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33281"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] build info | build=0 commit="unknown" tid="140551050502144" timestamp=1733687154
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140551050502144" timestamp=1733687154 total_threads=16
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33281" tid="140551050502144" timestamp=1733687154
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 11:45:54 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.898-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 11:45:55 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 11:45:55 launchpad ollama[91373]: INFO [main] model loaded | tid="140551050502144" timestamp=1733687155
+Dec 08 11:45:55 launchpad ollama[1650]: time=2024-12-08T11:45:55.902-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 11:46:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 11:46:07 | 200 |   13.2517165s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 11:47:18 launchpad ollama[1650]: time=2024-12-08T11:47:18.789-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 11:47:28 launchpad ollama[1650]: [GIN] 2024/12/08 - 11:47:28 | 200 |  9.932454336s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.157-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.9 GiB" free_swap="68.9 GiB"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 34157"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] build info | build=0 commit="unknown" tid="139958489665536" timestamp=1733688041
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139958489665536" timestamp=1733688041 total_threads=16
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34157" tid="139958489665536" timestamp=1733688041
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:00:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.595-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:00:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:00:42 launchpad ollama[94610]: INFO [main] model loaded | tid="139958489665536" timestamp=1733688042
+Dec 08 12:00:42 launchpad ollama[1650]: time=2024-12-08T12:00:42.599-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:00:55 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:00:55 | 200 | 14.138865344s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:02:53 launchpad ollama[1650]: time=2024-12-08T12:02:53.349-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:03:02 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:03:02 | 200 |  9.336662279s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:28:07 launchpad ollama[1650]: time=2024-12-08T12:28:07.995-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.146-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.146-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.147-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33187"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] build info | build=0 commit="unknown" tid="140674044354560" timestamp=1733689688
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140674044354560" timestamp=1733689688 total_threads=16
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33187" tid="140674044354560" timestamp=1733689688
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:28:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.438-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:28:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:28:09 launchpad ollama[97750]: INFO [main] model loaded | tid="140674044354560" timestamp=1733689689
+Dec 08 12:28:09 launchpad ollama[1650]: time=2024-12-08T12:28:09.442-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:28:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:28:31 | 200 | 23.477297113s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:29:42 launchpad ollama[1650]: time=2024-12-08T12:29:42.948-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:29:55 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:29:55 | 200 | 12.239291089s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:33:39 launchpad ollama[1650]: time=2024-12-08T12:33:39.317-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:33:52 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:33:52 | 200 | 13.105667696s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.021-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.176-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.176-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40743"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] build info | build=0 commit="unknown" tid="139692666245120" timestamp=1733690766
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139692666245120" timestamp=1733690766 total_threads=16
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40743" tid="139692666245120" timestamp=1733690766
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:46:06 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.468-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:46:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:46:07 launchpad ollama[105224]: INFO [main] model loaded | tid="139692666245120" timestamp=1733690767
+Dec 08 12:46:07 launchpad ollama[1650]: time=2024-12-08T12:46:07.471-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:46:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:46:20 | 200 | 14.605562135s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.148-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.296-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.296-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 41469"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] build info | build=0 commit="unknown" tid="139777021140992" timestamp=1733691493
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139777021140992" timestamp=1733691493 total_threads=16
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41469" tid="139777021140992" timestamp=1733691493
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:58:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.589-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:58:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:58:14 launchpad ollama[107236]: INFO [main] model loaded | tid="139777021140992" timestamp=1733691494
+Dec 08 12:58:14 launchpad ollama[1650]: time=2024-12-08T12:58:14.593-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 12:58:25 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:58:25 | 200 | 12.497340701s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.871-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.991-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.992-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 36551"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] build info | build=0 commit="unknown" tid="139995819257856" timestamp=1733691807
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139995819257856" timestamp=1733691807 total_threads=16
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36551" tid="139995819257856" timestamp=1733691807
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:03:27 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: time=2024-12-08T13:03:27.283-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:03:28 launchpad ollama[108920]: INFO [main] model loaded | tid="139995819257856" timestamp=1733691808
+Dec 08 13:03:28 launchpad ollama[1650]: time=2024-12-08T13:03:28.287-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:03:43 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:03:43 | 200 | 16.335701792s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.709-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.859-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.860-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 45133"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] build info | build=0 commit="unknown" tid="139995389186048" timestamp=1733693758
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139995389186048" timestamp=1733693758 total_threads=16
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45133" tid="139995389186048" timestamp=1733693758
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:35:58 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: time=2024-12-08T13:35:59.149-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:35:59 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:35:59 launchpad ollama[111252]: INFO [main] model loaded | tid="139995389186048" timestamp=1733693759
+Dec 08 13:36:00 launchpad ollama[1650]: time=2024-12-08T13:36:00.152-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:36:13 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:36:13 | 200 | 14.457473019s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:37:03 launchpad ollama[1650]: time=2024-12-08T13:37:03.557-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:37:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:37:07 | 200 |  4.419889077s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.501-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.650-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.650-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33725"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] build info | build=0 commit="unknown" tid="139963495624704" timestamp=1733694857
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139963495624704" timestamp=1733694857 total_threads=16
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33725" tid="139963495624704" timestamp=1733694857
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:54:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.940-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:54:18 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:54:18 launchpad ollama[113549]: INFO [main] model loaded | tid="139963495624704" timestamp=1733694858
+Dec 08 13:54:18 launchpad ollama[1650]: time=2024-12-08T13:54:18.944-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:54:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:54:31 | 200 | 13.913799814s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.824-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.971-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.971-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 35817"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.973-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] build info | build=0 commit="unknown" tid="140507858587648" timestamp=1733695413
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140507858587648" timestamp=1733695413 total_threads=16
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35817" tid="140507858587648" timestamp=1733695413
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:03:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: time=2024-12-08T14:03:34.265-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:03:35 launchpad ollama[115393]: INFO [main] model loaded | tid="140507858587648" timestamp=1733695415
+Dec 08 14:03:35 launchpad ollama[1650]: time=2024-12-08T14:03:35.269-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 14:03:42 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:03:42 | 200 |  8.825813962s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:12:53 launchpad ollama[1650]: time=2024-12-08T14:12:53.929-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.079-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.079-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 38301"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] build info | build=0 commit="unknown" tid="140157593313280" timestamp=1733695974
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140157593313280" timestamp=1733695974 total_threads=16
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38301" tid="140157593313280" timestamp=1733695974
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:12:54 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:12:55 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:12:55 launchpad ollama[116242]: INFO [main] model loaded | tid="140157593313280" timestamp=1733695975
+Dec 08 14:12:55 launchpad ollama[1650]: time=2024-12-08T14:12:55.376-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 14:13:21 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:13:21 | 200 | 27.641394282s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.306-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.463-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.463-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40551"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] build info | build=0 commit="unknown" tid="140232535138304" timestamp=1733696845
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140232535138304" timestamp=1733696845 total_threads=16
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40551" tid="140232535138304" timestamp=1733696845
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:27:25 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.755-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:27:26 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:27:26 launchpad ollama[120773]: INFO [main] model loaded | tid="140232535138304" timestamp=1733696846
+Dec 08 14:27:26 launchpad ollama[1650]: time=2024-12-08T14:27:26.758-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 14:27:43 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:27:43 | 200 | 18.686273917s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.185-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.329-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.329-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.330-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44955"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] build info | build=0 commit="unknown" tid="140070391537664" timestamp=1733786970
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140070391537664" timestamp=1733786970 total_threads=16
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44955" tid="140070391537664" timestamp=1733786970
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 09 15:29:30 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.619-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 09 15:29:31 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 09 15:29:31 launchpad ollama[150766]: INFO [main] model loaded | tid="140070391537664" timestamp=1733786971
+Dec 09 15:29:31 launchpad ollama[1650]: time=2024-12-09T15:29:31.623-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 09 15:30:03 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:30:03 | 200 | 33.295449291s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:30:07 launchpad ollama[1650]: time=2024-12-09T15:30:07.828-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:30:26 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:30:26 | 200 | 18.859048206s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:30:54 launchpad ollama[1650]: time=2024-12-09T15:30:54.112-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:31:05 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:31:05 | 200 | 11.163934187s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:31:29 launchpad ollama[1650]: time=2024-12-09T15:31:29.056-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:31:37 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:31:37 | 200 |  8.885529008s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.233-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.379-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.379-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35339"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] build info | build=0 commit="unknown" tid="140442120286208" timestamp=1733850034
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140442120286208" timestamp=1733850034 total_threads=16
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35339" tid="140442120286208" timestamp=1733850034
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 09:00:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.674-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 09:00:35 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 09:00:35 launchpad ollama[174677]: INFO [main] model loaded | tid="140442120286208" timestamp=1733850035
+Dec 10 09:00:35 launchpad ollama[1650]: time=2024-12-10T09:00:35.679-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 09:00:49 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:00:49 | 200 | 15.598580998s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:05:28 launchpad ollama[1650]: time=2024-12-10T09:05:28.730-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:05:50 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:05:50 | 200 | 21.631372507s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:06:57 launchpad ollama[1650]: time=2024-12-10T09:06:57.695-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:07:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:07:18 | 200 | 20.776754378s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:10:40 launchpad ollama[1650]: time=2024-12-10T09:10:40.942-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:11:30 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:11:30 | 200 | 50.042077717s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.715-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.866-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.866-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36685"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] build info | build=0 commit="unknown" tid="140199024840704" timestamp=1733853618
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140199024840704" timestamp=1733853618 total_threads=16
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36685" tid="140199024840704" timestamp=1733853618
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:00:18 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: time=2024-12-10T10:00:19.159-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:00:19 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:00:19 launchpad ollama[204106]: INFO [main] model loaded | tid="140199024840704" timestamp=1733853619
+Dec 10 10:00:20 launchpad ollama[1650]: time=2024-12-10T10:00:20.163-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:00:41 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:00:41 | 200 |  23.17640591s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:02:48 launchpad ollama[1650]: time=2024-12-10T10:02:48.458-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:03:04 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:03:04 | 200 |  15.57340047s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:03:26 launchpad ollama[1650]: time=2024-12-10T10:03:26.485-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:03:44 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:03:44 | 200 | 18.405971791s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:04:51 launchpad ollama[1650]: time=2024-12-10T10:04:51.604-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:05:09 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:05:09 | 200 | 17.662406949s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:12:40 launchpad ollama[1650]: time=2024-12-10T10:12:40.988-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.139-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.139-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42635"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.141-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] build info | build=0 commit="unknown" tid="140154409959424" timestamp=1733854361
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140154409959424" timestamp=1733854361 total_threads=16
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42635" tid="140154409959424" timestamp=1733854361
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:12:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.433-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:12:42 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:12:42 launchpad ollama[221878]: INFO [main] model loaded | tid="140154409959424" timestamp=1733854362
+Dec 10 10:12:42 launchpad ollama[1650]: time=2024-12-10T10:12:42.437-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:12:49 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:12:49 | 200 |  8.234829757s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:15:39 launchpad ollama[1650]: time=2024-12-10T10:15:39.237-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:15:54 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:15:54 | 200 | 15.748583648s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.120-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.275-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.275-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.276-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39317"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] build info | build=0 commit="unknown" tid="139865239724032" timestamp=1733855036
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139865239724032" timestamp=1733855036 total_threads=16
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39317" tid="139865239724032" timestamp=1733855036
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:23:56 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.569-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:23:57 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:23:57 launchpad ollama[227340]: INFO [main] model loaded | tid="139865239724032" timestamp=1733855037
+Dec 10 10:23:57 launchpad ollama[1650]: time=2024-12-10T10:23:57.573-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:24:12 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:24:12 | 200 | 16.631156179s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:27:59 launchpad ollama[1650]: time=2024-12-10T10:27:59.553-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:28:12 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:28:12 | 200 | 12.642030634s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:30:13 launchpad ollama[1650]: time=2024-12-10T10:30:13.028-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:30:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:30:18 | 200 |  5.024615826s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:31:48 launchpad ollama[1650]: time=2024-12-10T10:31:48.856-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:31:53 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:31:53 | 200 |  5.079846493s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.455-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.602-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.602-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42969"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] build info | build=0 commit="unknown" tid="140238677946368" timestamp=1733857003
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140238677946368" timestamp=1733857003 total_threads=16
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42969" tid="140238677946368" timestamp=1733857003
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:56:43 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.891-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:56:44 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:56:44 launchpad ollama[235209]: INFO [main] model loaded | tid="140238677946368" timestamp=1733857004
+Dec 10 10:56:44 launchpad ollama[1650]: time=2024-12-10T10:56:44.895-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 10:57:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:57:14 | 200 | 31.280194103s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:01:27 launchpad ollama[1650]: time=2024-12-10T11:01:27.257-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:03:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:03:13 | 200 |         1m45s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.712-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.855-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.856-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39523"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] build info | build=0 commit="unknown" tid="140045018062848" timestamp=1733858254
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140045018062848" timestamp=1733858254 total_threads=16
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39523" tid="140045018062848" timestamp=1733858254
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 11:17:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: time=2024-12-10T11:17:35.160-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 11:17:35 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 11:17:35 launchpad ollama[272575]: INFO [main] model loaded | tid="140045018062848" timestamp=1733858255
+Dec 10 11:17:36 launchpad ollama[1650]: time=2024-12-10T11:17:36.164-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 10 11:17:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:17:42 | 200 |  7.613038765s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.637-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.790-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.791-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39073"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] build info | build=0 commit="unknown" tid="140715010023424" timestamp=1733858884
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140715010023424" timestamp=1733858884 total_threads=16
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39073" tid="140715010023424" timestamp=1733858884
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 11:28:04 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: time=2024-12-10T11:28:05.086-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 11:28:05 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 11:28:05 launchpad ollama[274621]: INFO [main] model loaded | tid="140715010023424" timestamp=1733858885
+Dec 10 11:28:06 launchpad ollama[1650]: time=2024-12-10T11:28:06.090-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 11:28:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:28:14 | 200 |  9.443002105s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.080-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.224-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43849"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.226-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] build info | build=0 commit="unknown" tid="140425342947328" timestamp=1733863691
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140425342947328" timestamp=1733863691 total_threads=16
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43849" tid="140425342947328" timestamp=1733863691
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 12:48:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.515-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 12:48:12 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 12:48:12 launchpad ollama[277815]: INFO [main] model loaded | tid="140425342947328" timestamp=1733863692
+Dec 10 12:48:12 launchpad ollama[1650]: time=2024-12-10T12:48:12.520-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 12:48:19 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:48:19 | 200 |  8.712496404s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:50:22 launchpad ollama[1650]: time=2024-12-10T12:50:22.507-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:50:29 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:50:29 | 200 |  7.095418596s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:51:39 launchpad ollama[1650]: time=2024-12-10T12:51:39.212-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:51:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:51:55 | 200 | 16.042730667s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:53:02 launchpad ollama[1650]: time=2024-12-10T12:53:02.035-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:53:11 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:53:11 | 200 |  9.333209071s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:56:23 launchpad ollama[1650]: time=2024-12-10T12:56:23.844-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:56:59 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:56:59 | 200 | 35.407892399s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.300-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.454-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.454-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34605"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] build info | build=0 commit="unknown" tid="140344654024704" timestamp=1733865528
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140344654024704" timestamp=1733865528 total_threads=16
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34605" tid="140344654024704" timestamp=1733865528
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 13:18:48 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.745-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 13:18:49 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 13:18:49 launchpad ollama[299175]: INFO [main] model loaded | tid="140344654024704" timestamp=1733865529
+Dec 10 13:18:49 launchpad ollama[1650]: time=2024-12-10T13:18:49.749-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 13:19:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:19:13 | 200 | 25.362247142s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:20:55 launchpad ollama[1650]: time=2024-12-10T13:20:55.361-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:21:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:21:05 | 200 | 10.381297034s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:22:27 launchpad ollama[1650]: time=2024-12-10T13:22:27.545-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:23:02 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:23:02 | 200 | 34.594122762s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:27:11 launchpad ollama[1650]: time=2024-12-10T13:27:11.640-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:27:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:27:28 | 200 | 16.383890466s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:27:57 launchpad ollama[1650]: time=2024-12-10T13:27:57.453-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:28:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:28:05 | 200 |  8.166409833s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:28:27 launchpad ollama[1650]: time=2024-12-10T13:28:27.718-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:28:59 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:28:59 | 200 | 31.323937518s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:29:17 launchpad ollama[1650]: time=2024-12-10T13:29:17.217-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:29:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:29:28 | 200 |  11.66648602s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:30:25 launchpad ollama[1650]: time=2024-12-10T13:30:25.910-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:30:30 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:30:30 | 200 |  4.839795998s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.604-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.757-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.758-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33245"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] build info | build=0 commit="unknown" tid="140223374622720" timestamp=1733866596
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140223374622720" timestamp=1733866596 total_threads=16
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33245" tid="140223374622720" timestamp=1733866596
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 13:36:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: time=2024-12-10T13:36:37.050-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 13:36:37 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 13:36:37 launchpad ollama[332423]: INFO [main] model loaded | tid="140223374622720" timestamp=1733866597
+Dec 10 13:36:38 launchpad ollama[1650]: time=2024-12-10T13:36:38.054-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 13:36:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:36:42 | 200 |   5.49179454s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:37:10 launchpad ollama[1650]: time=2024-12-10T13:37:10.150-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:37:15 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:37:15 | 200 |  5.598442864s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:37:57 launchpad ollama[1650]: time=2024-12-10T13:37:57.357-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:38:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:38:05 | 200 |  8.616287505s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:38:47 launchpad ollama[1650]: time=2024-12-10T13:38:47.329-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:38:53 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:38:53 | 200 |  6.271213617s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:42:40 launchpad ollama[1650]: time=2024-12-10T13:42:40.071-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:42:58 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:42:58 | 200 | 18.673486435s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:43:24 launchpad ollama[1650]: time=2024-12-10T13:43:24.174-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:43:32 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:43:32 | 200 |  8.166034494s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:47:54 launchpad ollama[1650]: time=2024-12-10T13:47:54.090-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:48:36 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:48:36 | 200 | 42.472811818s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:49:17 launchpad ollama[1650]: time=2024-12-10T13:49:17.497-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:49:48 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:49:48 | 200 | 30.754339304s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.774-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.929-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.930-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36223"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] build info | build=0 commit="unknown" tid="140392264495104" timestamp=1733868085
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140392264495104" timestamp=1733868085 total_threads=16
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36223" tid="140392264495104" timestamp=1733868085
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:01:25 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: time=2024-12-10T14:01:26.221-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:01:26 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:01:27 launchpad ollama[362365]: INFO [main] model loaded | tid="140392264495104" timestamp=1733868086
+Dec 10 14:01:27 launchpad ollama[1650]: time=2024-12-10T14:01:27.225-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 14:01:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:01:42 | 200 | 16.917501817s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.044-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.200-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.201-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33171"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] build info | build=0 commit="unknown" tid="140321980641280" timestamp=1733868499
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140321980641280" timestamp=1733868499 total_threads=16
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33171" tid="140321980641280" timestamp=1733868499
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:08:19 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.497-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:08:20 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:08:20 launchpad ollama[365838]: INFO [main] model loaded | tid="140321980641280" timestamp=1733868500
+Dec 10 14:08:20 launchpad ollama[1650]: time=2024-12-10T14:08:20.501-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:08:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:08:42 | 200 |  23.18110582s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:11:03 launchpad ollama[1650]: time=2024-12-10T14:11:03.977-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:11:17 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:11:17 | 200 | 13.750877455s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:12:44 launchpad ollama[1650]: time=2024-12-10T14:12:44.128-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:12:52 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:12:52 | 200 |  8.702631088s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.143-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.291-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.291-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.292-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42585"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] build info | build=0 commit="unknown" tid="139899278577664" timestamp=1733869421
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139899278577664" timestamp=1733869421 total_threads=16
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42585" tid="139899278577664" timestamp=1733869421
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:23:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.591-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:23:42 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:23:42 launchpad ollama[376789]: INFO [main] model loaded | tid="139899278577664" timestamp=1733869422
+Dec 10 14:23:42 launchpad ollama[1650]: time=2024-12-10T14:23:42.595-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:24:03 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:24:03 | 200 | 22.165994735s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:33:56 launchpad ollama[1650]: time=2024-12-10T14:33:56.981-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.136-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.136-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 40473"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] build info | build=0 commit="unknown" tid="140049001594880" timestamp=1733870037
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140049001594880" timestamp=1733870037 total_threads=16
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40473" tid="140049001594880" timestamp=1733870037
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:33:57 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.429-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:33:58 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:33:58 launchpad ollama[381767]: INFO [main] model loaded | tid="140049001594880" timestamp=1733870038
+Dec 10 14:33:58 launchpad ollama[1650]: time=2024-12-10T14:33:58.434-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:34:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:34:18 | 200 | 21.837078215s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:38:39 launchpad ollama[1650]: time=2024-12-10T14:38:39.815-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:38:56 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:38:56 | 200 | 16.692629007s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.450-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.607-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.608-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33501"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.610-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] build info | build=0 commit="unknown" tid="140705561300992" timestamp=1733870852
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140705561300992" timestamp=1733870852 total_threads=16
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33501" tid="140705561300992" timestamp=1733870852
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:47:32 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.906-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:47:33 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:47:33 launchpad ollama[390600]: INFO [main] model loaded | tid="140705561300992" timestamp=1733870853
+Dec 10 14:47:33 launchpad ollama[1650]: time=2024-12-10T14:47:33.909-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:47:45 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:47:45 | 200 | 13.391795289s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:50:18 launchpad ollama[1650]: time=2024-12-10T14:50:18.729-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:50:43 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:50:43 | 200 | 24.817309058s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:51:57 launchpad ollama[1650]: time=2024-12-10T14:51:57.396-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:52:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:52:13 | 200 | 16.206639651s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:54:44 launchpad ollama[1650]: time=2024-12-10T14:54:44.484-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:55:02 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:55:02 | 200 | 17.705412297s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:55:40 launchpad ollama[1650]: time=2024-12-10T14:55:40.893-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:56:07 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:56:07 | 200 | 26.230730716s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:57:00 launchpad ollama[1650]: time=2024-12-10T14:57:00.005-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:57:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:57:18 | 200 | 18.065191785s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.479-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.630-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.631-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34683"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] build info | build=0 commit="unknown" tid="140274761580544" timestamp=1733872573
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140274761580544" timestamp=1733872573 total_threads=16
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34683" tid="140274761580544" timestamp=1733872573
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 15:16:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.925-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 15:16:14 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 15:16:14 launchpad ollama[417545]: INFO [main] model loaded | tid="140274761580544" timestamp=1733872574
+Dec 10 15:16:14 launchpad ollama[1650]: time=2024-12-10T15:16:14.928-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 15:16:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 15:16:28 | 200 | 14.648835449s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 16:57:39 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:39 | 404 |     128.842µs |       127.0.0.1 | POST     "/api/generate"
+Dec 10 16:57:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:55 | 200 |      15.651µs |       127.0.0.1 | HEAD     "/"
+Dec 10 16:57:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:55 | 200 |   18.644249ms |       127.0.0.1 | POST     "/api/show"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9508159488 required="6.2 GiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33643"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.466-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] build info | build=0 commit="unknown" tid="140253495021568" timestamp=1733878675
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140253495021568" timestamp=1733878675 total_threads=16
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33643" tid="140253495021568" timestamp=1733878675
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.717-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 16:57:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 16:58:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 16:58:01 launchpad ollama[421866]: INFO [main] model loaded | tid="140253495021568" timestamp=1733878681
+Dec 10 16:58:01 launchpad ollama[1650]: time=2024-12-10T16:58:01.488-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Dec 10 16:58:01 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:01 | 200 |  6.198144851s |       127.0.0.1 | POST     "/api/generate"
+Dec 10 16:58:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:14 | 200 |  786.154967ms |       127.0.0.1 | POST     "/api/chat"
+Dec 10 16:58:47 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:47 | 200 |  1.297004736s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:00:38 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:00:38 | 200 |  3.489483373s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.056-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.056-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.057-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34813"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] build info | build=0 commit="unknown" tid="140077366861824" timestamp=1733879351
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140077366861824" timestamp=1733879351 total_threads=16
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34813" tid="140077366861824" timestamp=1733879351
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.309-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:09:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:09:12 launchpad ollama[421932]: INFO [main] model loaded | tid="140077366861824" timestamp=1733879352
+Dec 10 17:09:12 launchpad ollama[1650]: time=2024-12-10T17:09:12.062-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 17:09:20 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:09:20 | 200 |  9.888046721s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.764-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612230656 required="6.2 GiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.765-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.765-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.766-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34009"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] build info | build=0 commit="unknown" tid="140584350109696" timestamp=1733879948
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140584350109696" timestamp=1733879948 total_threads=16
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34009" tid="140584350109696" timestamp=1733879948
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:19:09 launchpad ollama[1650]: time=2024-12-10T17:19:09.018-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:19:09 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:19:09 launchpad ollama[421988]: INFO [main] model loaded | tid="140584350109696" timestamp=1733879949
+Dec 10 17:19:10 launchpad ollama[1650]: time=2024-12-10T17:19:10.021-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 17:19:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:19:18 | 200 | 10.018811403s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35857"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] build info | build=0 commit="unknown" tid="140647704125440" timestamp=1733880650
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140647704125440" timestamp=1733880650 total_threads=16
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35857" tid="140647704125440" timestamp=1733880650
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.558-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:30:50 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:30:51 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:30:51 launchpad ollama[422045]: INFO [main] model loaded | tid="140647704125440" timestamp=1733880651
+Dec 10 17:30:51 launchpad ollama[1650]: time=2024-12-10T17:30:51.561-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 17:31:01 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:31:01 | 200 | 11.723588665s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41169"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.991-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] build info | build=0 commit="unknown" tid="140321830719488" timestamp=1733881747
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140321830719488" timestamp=1733881747 total_threads=16
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41169" tid="140321830719488" timestamp=1733881747
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:49:07 launchpad ollama[1650]: time=2024-12-10T17:49:07.241-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:49:07 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:49:08 launchpad ollama[422099]: INFO [main] model loaded | tid="140321830719488" timestamp=1733881748
+Dec 10 17:49:08 launchpad ollama[1650]: time=2024-12-10T17:49:08.246-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Dec 10 17:49:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:49:18 | 200 | 11.847662115s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.829-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.829-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.830-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34709"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] build info | build=0 commit="unknown" tid="140308399419392" timestamp=1733882161
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140308399419392" timestamp=1733882161 total_threads=16
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34709" tid="140308399419392" timestamp=1733882161
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:56:02 launchpad ollama[1650]: time=2024-12-10T17:56:02.082-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:56:02 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:56:02 launchpad ollama[422136]: INFO [main] model loaded | tid="140308399419392" timestamp=1733882162
+Dec 10 17:56:02 launchpad ollama[1650]: time=2024-12-10T17:56:02.836-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 17:56:09 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:56:09 | 200 |  7.935606954s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.126-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44159"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] build info | build=0 commit="unknown" tid="139812718727168" timestamp=1733882496
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139812718727168" timestamp=1733882496 total_threads=16
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44159" tid="139812718727168" timestamp=1733882496
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.378-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 18:01:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 18:01:37 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 18:01:37 launchpad ollama[422185]: INFO [main] model loaded | tid="139812718727168" timestamp=1733882497
+Dec 10 18:01:37 launchpad ollama[1650]: time=2024-12-10T18:01:37.381-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 18:01:44 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:01:44 | 200 |  8.499003961s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9499705344 required="6.2 GiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44585"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] build info | build=0 commit="unknown" tid="140556930772992" timestamp=1733883073
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140556930772992" timestamp=1733883073 total_threads=16
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44585" tid="140556930772992" timestamp=1733883073
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.786-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 18:11:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 18:11:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 18:11:14 launchpad ollama[422251]: INFO [main] model loaded | tid="140556930772992" timestamp=1733883074
+Dec 10 18:11:14 launchpad ollama[1650]: time=2024-12-10T18:11:14.540-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 18:11:22 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:11:22 | 200 |  9.319415241s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:13:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:13:14 | 200 |  3.260898825s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:15:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:15:42 | 200 |  1.883067966s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.441-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9483714560 required="6.2 GiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.441-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.442-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37249"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] build info | build=0 commit="unknown" tid="140625205911552" timestamp=1733931806
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140625205911552" timestamp=1733931806 total_threads=16
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37249" tid="140625205911552" timestamp=1733931806
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 07:43:26 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 07:43:27 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 11 07:43:27 launchpad ollama[423071]: INFO [main] model loaded | tid="140625205911552" timestamp=1733931807
+Dec 11 07:43:27 launchpad ollama[1650]: time=2024-12-11T07:43:27.447-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 11 07:43:34 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:43:34 | 200 |   8.43094446s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:45:23 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:45:23 | 200 |  7.365034468s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:46:07 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:46:07 | 200 |  4.253881663s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:48:08 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:48:08 | 200 |  2.783468333s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:43:30 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:30 | 200 |       15.84µs |       127.0.0.1 | HEAD     "/"
+Dec 11 13:43:30 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:30 | 200 |    3.396496ms |       127.0.0.1 | POST     "/api/show"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.645-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.787-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.788-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35235"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.790-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] build info | build=0 commit="unknown" tid="140148342874112" timestamp=1733953410
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140148342874112" timestamp=1733953410 total_threads=16
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35235" tid="140148342874112" timestamp=1733953410
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 13:43:30 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: time=2024-12-11T13:43:31.082-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 13:43:31 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 13:43:31 launchpad ollama[431032]: INFO [main] model loaded | tid="140148342874112" timestamp=1733953411
+Dec 11 13:43:32 launchpad ollama[1650]: time=2024-12-11T13:43:32.086-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 13:43:32 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:32 | 200 |  1.444791055s |       127.0.0.1 | POST     "/api/generate"
+Dec 11 13:46:44 launchpad ollama[1650]: time=2024-12-11T13:46:44.674-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:46:51 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:46:51 | 200 |  6.888345196s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.248-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.404-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.404-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42881"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] build info | build=0 commit="unknown" tid="139892779831296" timestamp=1733954039
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139892779831296" timestamp=1733954039 total_threads=16
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42881" tid="139892779831296" timestamp=1733954039
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 13:53:59 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 13:54:00 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 13:54:00 launchpad ollama[433241]: INFO [main] model loaded | tid="139892779831296" timestamp=1733954040
+Dec 11 13:54:00 launchpad ollama[1650]: time=2024-12-11T13:54:00.698-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 11 13:54:05 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:54:05 | 200 |  6.126193598s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:55:26 launchpad ollama[1650]: time=2024-12-11T13:55:26.775-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:55:54 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:55:54 | 200 | 27.614988181s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:57:52 launchpad ollama[1650]: time=2024-12-11T13:57:52.401-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:58:12 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:58:12 | 200 | 20.397791898s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:00:28 launchpad ollama[1650]: time=2024-12-11T14:00:28.893-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:00:34 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:00:34 | 200 |  5.768630267s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:01:48 launchpad ollama[1650]: time=2024-12-11T14:01:48.180-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:01:52 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:01:52 | 200 |  4.009592784s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.620-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.785-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.786-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36761"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] build info | build=0 commit="unknown" tid="140105837682688" timestamp=1733955011
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140105837682688" timestamp=1733955011 total_threads=16
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36761" tid="140105837682688" timestamp=1733955011
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 14:10:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: time=2024-12-11T14:10:12.090-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 14:10:12 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 14:10:12 launchpad ollama[451071]: INFO [main] model loaded | tid="140105837682688" timestamp=1733955012
+Dec 11 14:10:13 launchpad ollama[1650]: time=2024-12-11T14:10:13.093-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 11 14:10:27 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:10:27 | 200 | 15.651452873s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:11:53 launchpad ollama[1650]: time=2024-12-11T14:11:53.390-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:12:01 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:12:01 | 200 |  8.580213294s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.486-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.637-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 45319"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] build info | build=0 commit="unknown" tid="140361982791680" timestamp=1733956147
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140361982791680" timestamp=1733956147 total_threads=16
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45319" tid="140361982791680" timestamp=1733956147
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 14:29:07 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.938-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 14:29:08 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 14:29:08 launchpad ollama[455431]: INFO [main] model loaded | tid="140361982791680" timestamp=1733956148
+Dec 11 14:29:08 launchpad ollama[1650]: time=2024-12-11T14:29:08.942-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 14:30:20 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:30:20 | 200 |         1m13s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.445-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.603-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.604-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43521"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] build info | build=0 commit="unknown" tid="140609861214208" timestamp=1733960509
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140609861214208" timestamp=1733960509 total_threads=16
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43521" tid="140609861214208" timestamp=1733960509
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 15:41:49 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.903-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 15:41:50 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 15:41:50 launchpad ollama[474523]: INFO [main] model loaded | tid="140609861214208" timestamp=1733960510
+Dec 11 15:41:50 launchpad ollama[1650]: time=2024-12-11T15:41:50.907-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 15:42:04 launchpad ollama[1650]: [GIN] 2024/12/11 - 15:42:04 | 200 | 14.977727541s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 15:42:32 launchpad ollama[1650]: time=2024-12-11T15:42:32.466-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 15:42:57 launchpad ollama[1650]: [GIN] 2024/12/11 - 15:42:57 | 200 | 24.726327701s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.782-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.923-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.923-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46625"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] build info | build=0 commit="unknown" tid="140242544910336" timestamp=1734028695
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242544910336" timestamp=1734028695 total_threads=16
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46625" tid="140242544910336" timestamp=1734028695
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 10:38:15 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: time=2024-12-12T10:38:16.227-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: offloading 39 repeating layers to GPU
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: offloaded 39/41 layers to GPU
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 10:38:16 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 15
+Dec 12 10:38:17 launchpad ollama[525723]: INFO [main] model loaded | tid="140242544910336" timestamp=1734028697
+Dec 12 10:38:17 launchpad ollama[1650]: time=2024-12-12T10:38:17.230-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 10:38:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 10:38:33 | 200 | 18.154729699s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 10:39:27 launchpad ollama[1650]: time=2024-12-12T10:39:27.181-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 10:39:38 launchpad ollama[1650]: [GIN] 2024/12/12 - 10:39:38 | 200 | 11.712285197s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.677-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.822-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.3 GiB" free_swap="68.9 GiB"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.823-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 41259"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] build info | build=0 commit="unknown" tid="140266838138880" timestamp=1734041232
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140266838138880" timestamp=1734041232 total_threads=16
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41259" tid="140266838138880" timestamp=1734041232
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:07:12 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: time=2024-12-12T14:07:13.128-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:07:13 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:07:13 launchpad ollama[537723]: INFO [main] model loaded | tid="140266838138880" timestamp=1734041233
+Dec 12 14:07:14 launchpad ollama[1650]: time=2024-12-12T14:07:14.131-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 14:07:31 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:07:31 | 200 | 18.419763856s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:09:22 launchpad ollama[1650]: time=2024-12-12T14:09:22.170-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:09:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:09:33 | 200 | 11.519442637s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:23:20 launchpad ollama[1650]: time=2024-12-12T14:23:20.894-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.040-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.040-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34581"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.042-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] build info | build=0 commit="unknown" tid="139623722336256" timestamp=1734042201
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139623722336256" timestamp=1734042201 total_threads=16
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34581" tid="139623722336256" timestamp=1734042201
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:23:21 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.339-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:23:22 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:23:22 launchpad ollama[544424]: INFO [main] model loaded | tid="139623722336256" timestamp=1734042202
+Dec 12 14:23:22 launchpad ollama[1650]: time=2024-12-12T14:23:22.343-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:23:32 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:23:32 | 200 | 11.464269514s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.797-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.945-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.946-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34773"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] build info | build=0 commit="unknown" tid="140561075200000" timestamp=1734042638
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140561075200000" timestamp=1734042638 total_threads=16
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34773" tid="140561075200000" timestamp=1734042638
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:30:38 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: time=2024-12-12T14:30:39.239-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:30:39 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:30:40 launchpad ollama[546598]: INFO [main] model loaded | tid="140561075200000" timestamp=1734042640
+Dec 12 14:30:40 launchpad ollama[1650]: time=2024-12-12T14:30:40.243-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:31:02 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:31:02 | 200 | 24.058789556s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:32:38 launchpad ollama[1650]: time=2024-12-12T14:32:38.995-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:33:03 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:33:03 | 200 | 24.233151055s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.089-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.247-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33447"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] build info | build=0 commit="unknown" tid="139713601077248" timestamp=1734043097
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139713601077248" timestamp=1734043097 total_threads=16
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33447" tid="139713601077248" timestamp=1734043097
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:38:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.536-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:38:18 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:38:18 launchpad ollama[557894]: INFO [main] model loaded | tid="139713601077248" timestamp=1734043098
+Dec 12 14:38:18 launchpad ollama[1650]: time=2024-12-12T14:38:18.541-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 12 14:38:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:38:33 | 200 | 16.183629953s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:42:33 launchpad ollama[1650]: time=2024-12-12T14:42:33.257-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:42:49 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:42:49 | 200 | 16.693109504s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:43:23 launchpad ollama[1650]: time=2024-12-12T14:43:23.051-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:43:40 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:43:40 | 200 | 17.400904173s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:46:14 launchpad ollama[1650]: time=2024-12-12T14:46:14.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:46:26 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:46:26 | 200 |   12.2102709s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:48:06 launchpad ollama[1650]: time=2024-12-12T14:48:06.806-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:48:25 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:48:25 | 200 | 18.215535691s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.506-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.659-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.659-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44653"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.661-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] build info | build=0 commit="unknown" tid="140114165587968" timestamp=1734044212
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140114165587968" timestamp=1734044212 total_threads=16
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44653" tid="140114165587968" timestamp=1734044212
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:56:52 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.954-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:56:53 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:56:53 launchpad ollama[575346]: INFO [main] model loaded | tid="140114165587968" timestamp=1734044213
+Dec 12 14:56:53 launchpad ollama[1650]: time=2024-12-12T14:56:53.958-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:57:40 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:57:40 | 200 | 47.735583444s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 15:00:42 launchpad ollama[1650]: time=2024-12-12T15:00:42.823-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 15:01:25 launchpad ollama[1650]: [GIN] 2024/12/12 - 15:01:25 | 200 | 42.737194334s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.290-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.440-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.440-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.441-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35177"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] build info | build=0 commit="unknown" tid="140033460981760" timestamp=1734047660
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140033460981760" timestamp=1734047660 total_threads=16
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35177" tid="140033460981760" timestamp=1734047660
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 15:54:20 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.732-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 15:54:21 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 15:54:21 launchpad ollama[599855]: INFO [main] model loaded | tid="140033460981760" timestamp=1734047661
+Dec 12 15:54:21 launchpad ollama[1650]: time=2024-12-12T15:54:21.736-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 12 15:54:46 launchpad ollama[1650]: [GIN] 2024/12/12 - 15:54:46 | 200 | 26.277009717s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.303-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.453-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.453-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34969"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] build info | build=0 commit="unknown" tid="140453608828928" timestamp=1734048728
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140453608828928" timestamp=1734048728 total_threads=16
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34969" tid="140453608828928" timestamp=1734048728
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 16:12:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 16:12:09 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 16:12:09 launchpad ollama[607541]: INFO [main] model loaded | tid="140453608828928" timestamp=1734048729
+Dec 12 16:12:09 launchpad ollama[1650]: time=2024-12-12T16:12:09.756-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 16:12:22 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:12:22 | 200 | 14.324179286s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:13:13 launchpad ollama[1650]: time=2024-12-12T16:13:13.285-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:13:22 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:13:22 | 200 |  9.382966715s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.810-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.975-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.7 GiB" free_swap="68.9 GiB"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.975-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43449"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] build info | build=0 commit="unknown" tid="140152331472896" timestamp=1734050249
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140152331472896" timestamp=1734050249 total_threads=16
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43449" tid="140152331472896" timestamp=1734050249
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 16:37:29 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: time=2024-12-12T16:37:29.283-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 16:37:30 launchpad ollama[615372]: INFO [main] model loaded | tid="140152331472896" timestamp=1734050250
+Dec 12 16:37:30 launchpad ollama[1650]: time=2024-12-12T16:37:30.286-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 16:37:47 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:37:47 | 200 | 18.254656316s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 15:56:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:33 | 200 |      16.283µs |       127.0.0.1 | HEAD     "/"
+Dec 13 15:56:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:33 | 200 |   12.805255ms |       127.0.0.1 | POST     "/api/show"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9348120576 required="6.2 GiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.315-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44243"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] build info | build=0 commit="unknown" tid="139786189860864" timestamp=1734134193
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139786189860864" timestamp=1734134193 total_threads=16
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44243" tid="139786189860864" timestamp=1734134193
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.566-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 15:56:33 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 15:56:34 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 15:56:34 launchpad ollama[636600]: INFO [main] model loaded | tid="139786189860864" timestamp=1734134194
+Dec 13 15:56:34 launchpad ollama[1650]: time=2024-12-13T15:56:34.320-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 15:56:34 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:34 | 200 |  1.171286506s |       127.0.0.1 | POST     "/api/generate"
+Dec 13 16:01:29 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:01:29 | 200 |  6.136587742s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:02:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:02:33 | 200 |  4.447422642s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:04:01 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:04:01 | 200 |  5.357672267s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.267-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9507045376 required="6.2 GiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.268-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.269-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42889"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] build info | build=0 commit="unknown" tid="140604394651648" timestamp=1734135075
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604394651648" timestamp=1734135075 total_threads=16
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42889" tid="140604394651648" timestamp=1734135075
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.521-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:11:15 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:11:16 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:11:16 launchpad ollama[636840]: INFO [main] model loaded | tid="140604394651648" timestamp=1734135076
+Dec 13 16:11:16 launchpad ollama[1650]: time=2024-12-13T16:11:16.274-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:11:23 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:11:23 | 200 |  8.898060255s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:15:11 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:15:11 | 200 |  6.804123041s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9519038464 required="6.2 GiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39921"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.207-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] build info | build=0 commit="unknown" tid="139850658385920" timestamp=1734136435
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139850658385920" timestamp=1734136435 total_threads=16
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39921" tid="139850658385920" timestamp=1734136435
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.458-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:33:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:33:56 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:33:56 launchpad ollama[637115]: INFO [main] model loaded | tid="139850658385920" timestamp=1734136436
+Dec 13 16:33:56 launchpad ollama[1650]: time=2024-12-13T16:33:56.210-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:34:02 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:34:02 | 200 |  7.432932971s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.720-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9371385856 required="6.2 GiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.720-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.721-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36199"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] build info | build=0 commit="unknown" tid="140314999214080" timestamp=1734136936
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140314999214080" timestamp=1734136936 total_threads=16
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36199" tid="140314999214080" timestamp=1734136936
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.973-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:42:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:42:17 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:42:17 launchpad ollama[637303]: INFO [main] model loaded | tid="140314999214080" timestamp=1734136937
+Dec 13 16:42:17 launchpad ollama[1650]: time=2024-12-13T16:42:17.726-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:42:22 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:42:22 | 200 |  5.898577155s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:43:36 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:43:36 | 200 |  2.113112404s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:47:06 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:47:06 | 200 |  3.491798481s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:49:11 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:49:11 | 200 |  878.893601ms |       127.0.0.1 | POST     "/api/chat"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.541-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989310976 required="6.2 GiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.541-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.542-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38123"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] build info | build=0 commit="unknown" tid="140598098186240" timestamp=1734194529
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140598098186240" timestamp=1734194529 total_threads=16
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38123" tid="140598098186240" timestamp=1734194529
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.794-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 08:42:09 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 08:42:10 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 08:42:10 launchpad ollama[642606]: INFO [main] model loaded | tid="140598098186240" timestamp=1734194530
+Dec 14 08:42:10 launchpad ollama[1650]: time=2024-12-14T08:42:10.547-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 08:42:26 launchpad ollama[1650]: [GIN] 2024/12/14 - 08:42:26 | 200 | 16.798782892s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.229-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989638656 required="6.2 GiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.229-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.230-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39261"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] build info | build=0 commit="unknown" tid="140004482441216" timestamp=1734194946
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140004482441216" timestamp=1734194946 total_threads=16
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39261" tid="140004482441216" timestamp=1734194946
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.482-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 08:49:06 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 08:49:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 08:49:07 launchpad ollama[642642]: INFO [main] model loaded | tid="140004482441216" timestamp=1734194947
+Dec 14 08:49:07 launchpad ollama[1650]: time=2024-12-14T08:49:07.234-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 08:49:17 launchpad ollama[1650]: [GIN] 2024/12/14 - 08:49:17 | 200 | 11.443394435s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989900800 required="6.2 GiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40259"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.157-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] build info | build=0 commit="unknown" tid="140141380591616" timestamp=1734195653
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140141380591616" timestamp=1734195653 total_threads=16
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40259" tid="140141380591616" timestamp=1734195653
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.408-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:00:53 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:00:54 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:00:54 launchpad ollama[642700]: INFO [main] model loaded | tid="140141380591616" timestamp=1734195654
+Dec 14 09:00:54 launchpad ollama[1650]: time=2024-12-14T09:00:54.161-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:01:04 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:01:04 | 200 | 11.281515021s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:05:25 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:05:25 | 200 | 10.312164917s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989638656 required="6.2 GiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46307"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] build info | build=0 commit="unknown" tid="139857237069824" timestamp=1734196448
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139857237069824" timestamp=1734196448 total_threads=16
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46307" tid="139857237069824" timestamp=1734196448
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.481-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:14:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:14:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:14:09 launchpad ollama[642758]: INFO [main] model loaded | tid="139857237069824" timestamp=1734196449
+Dec 14 09:14:09 launchpad ollama[1650]: time=2024-12-14T09:14:09.234-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:14:18 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:14:18 | 200 | 10.829763262s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.571-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8841265152 required="6.2 GiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.571-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.572-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36277"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] build info | build=0 commit="unknown" tid="139682762309632" timestamp=1734196892
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139682762309632" timestamp=1734196892 total_threads=16
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36277" tid="139682762309632" timestamp=1734196892
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.824-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:21:32 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:21:33 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:21:33 launchpad ollama[643754]: INFO [main] model loaded | tid="139682762309632" timestamp=1734196893
+Dec 14 09:21:33 launchpad ollama[1650]: time=2024-12-14T09:21:33.576-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:21:40 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:21:40 | 200 |  7.608779732s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:22:59 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:22:59 | 200 |  6.126859969s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:25:00 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:25:00 | 200 |  4.930539019s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:28:02 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:28:02 | 200 |  4.348262508s |       127.0.0.1 | POST     "/api/chat"
+Dec 15 08:15:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:15:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:15:59 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:15:59 launchpad systemd[1]: ollama.service: Consumed 1h 49min 39.416s CPU time, 12.6G memory peak, 11.4G read from disk, 508.3M written to disk.
+-- Boot 7f8995ca71744c71aa779feae177374a --
+Dec 15 08:16:34 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:16:34 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:16:34 launchpad ollama[1569]: 2024/12/15 08:16:34 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.509-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.513-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.514-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.515-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2435835559/runners
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.721-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 15 08:22:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:23:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:23:00 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:23:00 launchpad systemd[1]: ollama.service: Consumed 3.444s CPU time, 790.5M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot 587ab3b258734cc5b4c9607b461135f6 --
+Dec 15 08:23:32 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:23:32 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:23:32 launchpad ollama[1568]: 2024/12/15 08:23:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.476-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.479-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.480-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.482-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3123740476/runners
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.651-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 15 08:27:09 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:27:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:27:09 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:27:09 launchpad systemd[1]: ollama.service: Consumed 3.431s CPU time, 790.8M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 7f13a190f66242fd900f6239a6cfd33c --
+Dec 15 08:27:42 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:27:42 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:27:42 launchpad ollama[1572]: 2024/12/15 08:27:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.977-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.981-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.982-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.984-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4062981542/runners
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:46 launchpad ollama[1572]: time=2024-12-15T08:27:46.152-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 16 08:07:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:07 | 200 |    1.732896ms |       127.0.0.1 | HEAD     "/"
+Dec 16 08:07:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:07 | 200 |   25.578353ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.213-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.420-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="66.6 GiB"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.421-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.428-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 40921"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.429-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.429-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.430-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] build info | build=0 commit="unknown" tid="139941389623296" timestamp=1734365227
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139941389623296" timestamp=1734365227 total_threads=16
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40921" tid="139941389623296" timestamp=1734365227
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:07:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.681-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:07:15 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:07:16 launchpad ollama[132786]: INFO [main] model loaded | tid="139941389623296" timestamp=1734365236
+Dec 16 08:07:16 launchpad ollama[1572]: time=2024-12-16T08:07:16.211-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 16 08:07:16 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:16 | 200 |  9.003896893s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.839-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.981-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="66.6 GiB"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.981-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 37961"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] build info | build=0 commit="unknown" tid="140040237510656" timestamp=1734365767
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140040237510656" timestamp=1734365767 total_threads=16
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37961" tid="140040237510656" timestamp=1734365767
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:16:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: time=2024-12-16T08:16:07.305-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:16:08 launchpad ollama[133209]: INFO [main] model loaded | tid="140040237510656" timestamp=1734365768
+Dec 16 08:16:08 launchpad ollama[1572]: time=2024-12-16T08:16:08.308-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.33 seconds"
+Dec 16 08:16:38 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:16:38 | 200 |  31.35051265s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.097-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.243-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="66.6 GiB"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.244-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 41903"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.246-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] build info | build=0 commit="unknown" tid="140577512665088" timestamp=1734366104
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140577512665088" timestamp=1734366104 total_threads=16
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41903" tid="140577512665088" timestamp=1734366104
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:21:44 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.561-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:21:45 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:21:45 launchpad ollama[140970]: INFO [main] model loaded | tid="140577512665088" timestamp=1734366105
+Dec 16 08:21:45 launchpad ollama[1572]: time=2024-12-16T08:21:45.567-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 08:22:16 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:22:16 | 200 | 31.955048728s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:26:25 launchpad ollama[1572]: time=2024-12-16T08:26:25.556-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:27:12 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:27:12 | 200 | 46.798941573s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:28:49 launchpad ollama[1572]: time=2024-12-16T08:28:49.020-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:28:55 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:28:55 | 200 |  6.599727412s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:32:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:21 | 200 |      15.818µs |       127.0.0.1 | HEAD     "/"
+Dec 16 13:32:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:21 | 200 |    3.293818ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.035-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.178-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="66.9 GiB"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.178-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 42837"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.181-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] build info | build=0 commit="unknown" tid="140314879602688" timestamp=1734384741
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140314879602688" timestamp=1734384741 total_threads=16
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42837" tid="140314879602688" timestamp=1734384741
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 13:32:21 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.494-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 13:32:22 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 13:32:22 launchpad ollama[233429]: INFO [main] model loaded | tid="140314879602688" timestamp=1734384742
+Dec 16 13:32:22 launchpad ollama[1572]: time=2024-12-16T13:32:22.497-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 13:32:22 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:22 | 200 |  1.465909497s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 13:33:03 launchpad ollama[1572]: time=2024-12-16T13:33:03.838-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:33:22 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:33:22 | 200 | 18.586175577s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:35:13 launchpad ollama[1572]: time=2024-12-16T13:35:13.351-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:35:29 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:35:29 | 200 | 16.096727432s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:38:04 launchpad ollama[1572]: time=2024-12-16T13:38:04.986-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:38:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:38:21 | 200 | 16.795888904s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:39:04 launchpad ollama[1572]: time=2024-12-16T13:39:04.636-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:39:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:39:07 | 200 |  2.562633713s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:46:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:07 | 200 |      16.015µs |       127.0.0.1 | HEAD     "/"
+Dec 16 16:46:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:07 | 200 |    3.401384ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.605-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.740-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="66.9 GiB"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.740-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 34999"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] build info | build=0 commit="unknown" tid="140242733588480" timestamp=1734396367
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242733588480" timestamp=1734396367 total_threads=16
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34999" tid="140242733588480" timestamp=1734396367
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 16:46:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: time=2024-12-16T16:46:08.057-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 16:46:08 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 16:46:08 launchpad ollama[255053]: INFO [main] model loaded | tid="140242733588480" timestamp=1734396368
+Dec 16 16:46:09 launchpad ollama[1572]: time=2024-12-16T16:46:09.060-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 16:46:09 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:09 | 200 |  1.457804183s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 16:47:28 launchpad ollama[1572]: time=2024-12-16T16:47:28.096-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:47:43 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:47:43 | 200 | 15.256738627s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:49:14 launchpad ollama[1572]: time=2024-12-16T16:49:14.434-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:49:30 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:49:30 | 200 | 16.045843727s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.545-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.691-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.7 GiB" free_swap="66.9 GiB"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.692-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.693-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 43583"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] build info | build=0 commit="unknown" tid="140208758382592" timestamp=1734397123
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140208758382592" timestamp=1734397123 total_threads=16
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43583" tid="140208758382592" timestamp=1734397123
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 16:58:43 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: time=2024-12-16T16:58:44.004-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 16:58:44 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 16 16:58:44 launchpad ollama[263521]: INFO [main] model loaded | tid="140208758382592" timestamp=1734397124
+Dec 16 16:58:45 launchpad ollama[1572]: time=2024-12-16T16:58:45.008-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 16 16:58:59 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:58:59 | 200 | 16.429269815s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.221-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.371-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.371-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 39603"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] build info | build=0 commit="unknown" tid="140615269732352" timestamp=1734398310
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140615269732352" timestamp=1734398310 total_threads=16
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39603" tid="140615269732352" timestamp=1734398310
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:18:30 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.690-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:18:31 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 16 17:18:31 launchpad ollama[267913]: INFO [main] model loaded | tid="140615269732352" timestamp=1734398311
+Dec 16 17:18:31 launchpad ollama[1572]: time=2024-12-16T17:18:31.694-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:18:46 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:18:46 | 200 | 16.255247256s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:21:03 launchpad ollama[1572]: time=2024-12-16T17:21:03.361-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:21:19 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:21:19 | 200 | 16.126206061s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.131-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.280-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.280-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.281-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 46803"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] build info | build=0 commit="unknown" tid="139700488409088" timestamp=1734399032
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139700488409088" timestamp=1734399032 total_threads=16
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46803" tid="139700488409088" timestamp=1734399032
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:30:32 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.598-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:30:33 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 17:30:33 launchpad ollama[275350]: INFO [main] model loaded | tid="139700488409088" timestamp=1734399033
+Dec 16 17:30:33 launchpad ollama[1572]: time=2024-12-16T17:30:33.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:30:55 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:30:55 | 200 |  23.71460947s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.789-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.941-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.941-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.942-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 35353"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] build info | build=0 commit="unknown" tid="140131594788864" timestamp=1734399478
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140131594788864" timestamp=1734399478 total_threads=16
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35353" tid="140131594788864" timestamp=1734399478
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:37:58 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: time=2024-12-16T17:37:59.260-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:37:59 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 17:38:00 launchpad ollama[279987]: INFO [main] model loaded | tid="140131594788864" timestamp=1734399480
+Dec 16 17:38:00 launchpad ollama[1572]: time=2024-12-16T17:38:00.266-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:38:09 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:38:09 | 200 | 11.084683925s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:40:26 launchpad ollama[1572]: time=2024-12-16T17:40:26.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:40:45 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:40:45 | 200 | 18.796467048s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 18:13:44 launchpad ollama[1572]: time=2024-12-16T18:13:44.876-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.025-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="66.9 GiB"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.026-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36911"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] build info | build=0 commit="unknown" tid="139918625984512" timestamp=1734401625
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139918625984512" timestamp=1734401625 total_threads=16
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36911" tid="139918625984512" timestamp=1734401625
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 18:13:45 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.341-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 18:13:46 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 26
+Dec 16 18:13:46 launchpad ollama[286600]: INFO [main] model loaded | tid="139918625984512" timestamp=1734401626
+Dec 16 18:13:46 launchpad ollama[1572]: time=2024-12-16T18:13:46.345-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 18:14:06 launchpad ollama[1572]: [GIN] 2024/12/16 - 18:14:06 | 200 | 21.318673353s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 18:14:27 launchpad ollama[1572]: time=2024-12-16T18:14:27.247-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 18:14:38 launchpad ollama[1572]: [GIN] 2024/12/16 - 18:14:38 | 200 | 11.437179683s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.573-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.727-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="67.0 GiB"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.727-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 34391"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.729-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] build info | build=0 commit="unknown" tid="140325011386368" timestamp=1734450577
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140325011386368" timestamp=1734450577 total_threads=16
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34391" tid="140325011386368" timestamp=1734450577
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 17 07:49:37 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: time=2024-12-17T07:49:38.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 17 07:49:38 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 17 07:49:38 launchpad ollama[299296]: INFO [main] model loaded | tid="140325011386368" timestamp=1734450578
+Dec 17 07:49:39 launchpad ollama[1572]: time=2024-12-17T07:49:39.047-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 17 07:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:50:12 | 200 | 34.923115772s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:51:37 launchpad ollama[1572]: time=2024-12-17T07:51:37.603-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:51:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:51:53 | 200 | 16.138929914s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:52:55 launchpad ollama[1572]: time=2024-12-17T07:52:55.236-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:53:31 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:53:31 | 200 | 35.837838754s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:55:48 launchpad ollama[1572]: time=2024-12-17T07:55:48.658-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:56:04 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:56:04 | 200 | 15.734550328s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:59:28 launchpad ollama[1572]: time=2024-12-17T07:59:28.613-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:59:40 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:59:40 | 200 | 11.434446802s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:20:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:20:34 | 200 |      15.603µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:20:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:20:34 | 200 |    5.159978ms |       127.0.0.1 | GET      "/api/tags"
+Dec 17 11:25:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:25:09 | 404 |     112.388µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:27:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:27:30 | 404 |     201.257µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:28:22 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:28:22 | 404 |     112.055µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:28:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:28:45 | 404 |     143.418µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:29:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:29:30 | 404 |      94.585µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:29:47 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:29:47 | 404 |     148.802µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:32:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:32:12 | 404 |     146.222µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:33:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:33:02 | 404 |      80.916µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:34:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:02 | 200 |      16.415µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:34:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:02 | 200 |    1.756919ms |       127.0.0.1 | GET      "/api/tags"
+Dec 17 11:34:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:35 | 404 |      84.811µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:34:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:49 | 200 |    3.470226ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.186-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.346-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="67.0 GiB"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.346-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 41259"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] build info | build=0 commit="unknown" tid="140081273618432" timestamp=1734464107
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140081273618432" timestamp=1734464107 total_threads=16
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41259" tid="140081273618432" timestamp=1734464107
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 17 11:35:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.671-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 17 11:35:08 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 17 11:35:08 launchpad ollama[861591]: INFO [main] model loaded | tid="140081273618432" timestamp=1734464108
+Dec 17 11:35:08 launchpad ollama[1572]: time=2024-12-17T11:35:08.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.33 seconds"
+Dec 17 11:35:08 launchpad ollama[861591]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1770 n_keep=4 n_left=2044 n_shift=1022 tid="140081273618432" timestamp=1734464108
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 | 47.721660674s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.652541ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.506657ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.281126ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |     3.26924ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:37:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:37:25 | 200 |    5.516592ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:38:26 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:38:26 | 200 |    5.230736ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:38:26 launchpad ollama[1572]: time=2024-12-17T11:38:26.276-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 | 53.267199828s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.515037ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.598043ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.475272ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: time=2024-12-17T11:39:19.558-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:39:20 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:20 | 200 |  1.191025203s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:39:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:25 | 200 |     3.73684ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:25 launchpad ollama[1572]: time=2024-12-17T11:39:25.729-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 | 40.467856449s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.527293ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.659195ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.675467ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: time=2024-12-17T11:40:06.211-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |  3.138858564s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.384973ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.268885ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.559398ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.132726ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:29 launchpad ollama[1572]: time=2024-12-17T11:40:29.191-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |  1.366045992s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.336638ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    4.127728ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.251137ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.557758ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:08 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:08 | 200 |    3.341518ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:08 launchpad ollama[1572]: time=2024-12-17T11:41:08.635-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 | 40.868745177s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.604762ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.673155ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.075599ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:56 | 200 |    3.757316ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:06 launchpad ollama[1572]: time=2024-12-17T11:42:06.494-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |  4.067280721s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.465717ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.240047ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.442517ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.204584ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:49 | 200 |    3.576072ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:49 launchpad ollama[1572]: time=2024-12-17T11:42:49.492-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 | 41.241049981s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.347869ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.544009ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.414563ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: time=2024-12-17T11:43:30.749-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:43:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:34 | 200 |  3.582464741s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:43:50 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:50 | 200 |    3.480633ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:50 launchpad ollama[1572]: time=2024-12-17T11:43:50.733-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 | 41.271786737s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |    3.624681ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |     3.67793ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |    3.188362ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:39 | 200 |    3.511368ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:39 launchpad ollama[1572]: time=2024-12-17T11:44:39.347-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 | 15.438769121s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.592361ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.670274ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.345963ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: time=2024-12-17T11:44:54.800-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |  5.481096374s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.271702ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.708494ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.164169ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.565407ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:22 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:22 | 200 |    3.165493ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:22 launchpad ollama[1572]: time=2024-12-17T11:45:22.498-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 | 16.016094349s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    3.815723ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    4.102251ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    3.555883ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: time=2024-12-17T11:45:38.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |  5.701122409s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.197254ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |     4.23866ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.227173ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.768713ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:34 launchpad ollama[1572]: time=2024-12-17T11:46:34.412-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 | 16.986524599s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.446694ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.733003ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.529409ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.270679ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:17 launchpad ollama[1572]: time=2024-12-17T11:47:17.449-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 | 18.019481659s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.252731ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.452773ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    4.092426ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.808774ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: time=2024-12-17T11:47:35.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 | 20.396254743s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    3.635684ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    4.415975ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    3.151721ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: time=2024-12-17T11:47:55.961-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |  3.667295825s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.299418ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.864437ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.347656ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |     3.92988ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:14 launchpad ollama[1572]: time=2024-12-17T11:48:14.428-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |  2.063212295s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |    4.151178ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |     4.39643ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |    3.542621ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |     3.62247ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:36 | 200 |    3.239243ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:36 launchpad ollama[1572]: time=2024-12-17T11:48:36.715-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 | 14.353676494s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.653713ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.690094ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.365176ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:53 launchpad ollama[1572]: time=2024-12-17T11:48:53.271-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |  3.489522935s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.386586ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.622132ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.136189ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.611936ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:32 launchpad ollama[1572]: time=2024-12-17T11:49:32.523-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |  2.008228468s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.377142ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    4.045894ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.352814ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.276895ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:37 launchpad ollama[1572]: time=2024-12-17T11:49:37.920-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |  1.999980407s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.737004ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.274788ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.274432ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.476006ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:40 launchpad ollama[1572]: time=2024-12-17T11:49:40.829-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    1.9710455s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.380483ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.873673ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.723248ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.105496ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:43 launchpad ollama[1572]: time=2024-12-17T11:49:43.735-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |  1.994826533s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.444324ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.758586ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.219782ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.715591ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:06 | 200 |    3.307249ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:10 launchpad ollama[1572]: time=2024-12-17T11:50:10.716-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |  2.026880775s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.921184ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.474571ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.714548ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |      3.2587ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:31 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:31 | 404 |     124.129µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:42 | 404 |     182.392µs |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:49 | 200 |    3.630645ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:51 launchpad ollama[1572]: time=2024-12-17T11:50:51.656-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |   2.04381689s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.437339ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.903615ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.786296ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.416089ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:37 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:37 | 200 |    3.244939ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:39 launchpad ollama[1572]: time=2024-12-17T11:52:39.950-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |  2.653646714s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    4.802332ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    3.907449ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |     3.30092ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    3.159054ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:21 launchpad ollama[1572]: time=2024-12-17T11:53:21.476-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |  2.793217621s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.277695ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.341861ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.564028ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.194388ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:33 launchpad ollama[1572]: time=2024-12-17T11:53:33.939-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |  2.801734217s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.472244ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    4.063404ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.232822ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.692176ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:30 launchpad ollama[1572]: time=2024-12-17T11:54:30.250-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |   2.82432102s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.682781ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.719997ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |     3.65298ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.451804ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |      14.915µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |     3.19846ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:55:01 launchpad ollama[1572]: time=2024-12-17T11:55:01.163-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |    3.797903ms |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:55:38 launchpad ollama[1572]: time=2024-12-17T11:55:38.153-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:55:41 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:41 | 200 |  3.221294985s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:56:17 launchpad ollama[1572]: time=2024-12-17T11:56:17.217-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:56:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:56:25 | 200 |  8.634351008s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:56:34 launchpad ollama[1572]: time=2024-12-17T11:56:34.255-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:56:40 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:56:40 | 200 |  6.712967398s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:57:16 launchpad ollama[1572]: time=2024-12-17T11:57:16.185-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:57:28 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:57:28 | 200 | 12.770291401s |       127.0.0.1 | POST     "/api/chat"
+-- Boot ec1135dab11f474ca5286a2fef056c09 --
+Dec 17 18:00:58 launchpad systemd[1]: Starting Server for local large language models...
+Dec 17 18:00:58 launchpad systemd[1]: Started Server for local large language models.
+Dec 17 18:00:58 launchpad ollama[1620]: 2024/12/17 18:00:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.689-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.693-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.693-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.695-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2831793936/runners
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.892-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:17 | 200 |     364.657µs |       127.0.0.1 | HEAD     "/"
+Dec 18 08:26:17 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:17 | 200 |    5.992374ms |       127.0.0.1 | POST     "/api/show"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.561-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10250158080 required="9.2 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.699-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38309"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] build info | build=0 commit="unknown" tid="140601025531904" timestamp=1734539177
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140601025531904" timestamp=1734539177 total_threads=16
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38309" tid="140601025531904" timestamp=1734539177
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:26:17 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.950-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:26:26 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:26:26 launchpad ollama[7745]: INFO [main] model loaded | tid="140601025531904" timestamp=1734539186
+Dec 18 08:26:26 launchpad ollama[1620]: time=2024-12-18T08:26:26.480-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 18 08:26:26 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:26 | 200 |  8.923147347s |       127.0.0.1 | POST     "/api/generate"
+Dec 18 08:26:45 launchpad ollama[1620]: time=2024-12-18T08:26:45.161-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:26:57 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:57 | 200 | 12.317376986s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.204-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10250158080 required="9.2 GiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 34313"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.351-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] build info | build=0 commit="unknown" tid="140706388664320" timestamp=1734539584
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140706388664320" timestamp=1734539584 total_threads=16
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34313" tid="140706388664320" timestamp=1734539584
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:33:04 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.634-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:33:05 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:33:05 launchpad ollama[7836]: INFO [main] model loaded | tid="140706388664320" timestamp=1734539585
+Dec 18 08:33:05 launchpad ollama[1620]: time=2024-12-18T08:33:05.637-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 08:33:18 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:33:18 | 200 |  14.22103974s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.273-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.419-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9966452736 required="9.2 GiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.419-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.420-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 36331"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] build info | build=0 commit="unknown" tid="140122786766848" timestamp=1734540273
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122786766848" timestamp=1734540273 total_threads=16
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36331" tid="140122786766848" timestamp=1734540273
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:44:33 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.709-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:44:34 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:44:34 launchpad ollama[8201]: INFO [main] model loaded | tid="140122786766848" timestamp=1734540274
+Dec 18 08:44:34 launchpad ollama[1620]: time=2024-12-18T08:44:34.713-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 08:44:47 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:44:47 | 200 | 14.344121562s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:45:12 launchpad ollama[1620]: time=2024-12-18T08:45:12.862-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:45:23 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:45:23 | 200 | 10.164273531s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.020-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.164-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9984737280 required="9.2 GiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.164-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.165-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 36597"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] build info | build=0 commit="unknown" tid="140087951147008" timestamp=1734540669
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140087951147008" timestamp=1734540669 total_threads=16
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36597" tid="140087951147008" timestamp=1734540669
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:51:09 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.462-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:51:10 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:51:10 launchpad ollama[8399]: INFO [main] model loaded | tid="140087951147008" timestamp=1734540670
+Dec 18 08:51:10 launchpad ollama[1620]: time=2024-12-18T08:51:10.466-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 18 08:51:29 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:51:29 | 200 | 20.344841658s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:54:35 launchpad ollama[1620]: time=2024-12-18T08:54:35.242-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:54:44 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:54:44 | 200 |  8.979218815s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:58:06 launchpad ollama[1620]: time=2024-12-18T08:58:06.634-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:58:27 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:58:27 | 200 | 20.973919252s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:02:54 launchpad ollama[1620]: time=2024-12-18T09:02:54.017-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:03:16 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:03:16 | 200 | 22.938191365s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:04:24 launchpad ollama[1620]: time=2024-12-18T09:04:24.837-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:04:48 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:04:48 | 200 | 23.184344856s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:09:11 launchpad ollama[1620]: time=2024-12-18T09:09:11.966-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:09:30 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:09:30 | 200 | 18.907398692s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:11:49 launchpad ollama[1620]: time=2024-12-18T09:11:49.132-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:12:06 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:12:06 | 200 | 17.640043768s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:14:31 launchpad ollama[1620]: time=2024-12-18T09:14:31.413-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:14:44 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:14:44 | 200 | 12.874644777s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:16:40 launchpad ollama[1620]: time=2024-12-18T09:16:40.421-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:16:46 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:16:46 | 200 |  6.071586475s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.105-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.251-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.252-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33937"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] build info | build=0 commit="unknown" tid="139748038270976" timestamp=1734556572
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748038270976" timestamp=1734556572 total_threads=16
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33937" tid="139748038270976" timestamp=1734556572
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 13:16:12 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.543-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 13:16:13 launchpad ollama[1620]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 26
+Dec 18 13:16:13 launchpad ollama[12110]: INFO [main] model loaded | tid="139748038270976" timestamp=1734556573
+Dec 18 13:16:13 launchpad ollama[1620]: time=2024-12-18T13:16:13.547-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 13:16:24 launchpad ollama[1620]: [GIN] 2024/12/18 - 13:16:24 | 200 | 12.052254895s |       127.0.0.1 | POST     "/api/chat"
+Dec 22 18:30:50 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 22 18:30:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 22 18:30:52 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 22 18:30:52 launchpad systemd[1]: ollama.service: Consumed 4min 24.320s CPU time, 8.1G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot e41844aa9c074591b27e1c5a25e18d3c --
+Dec 23 07:45:09 launchpad systemd[1]: Starting Server for local large language models...
+Dec 23 07:45:09 launchpad systemd[1]: Started Server for local large language models.
+Dec 23 07:45:09 launchpad ollama[1579]: 2024/12/23 07:45:09 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.464-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.468-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.469-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.471-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2172411424/runners
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.466-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.676-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:01 | 200 |     318.329µs |       127.0.0.1 | HEAD     "/"
+Dec 26 08:40:01 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:01 | 200 |   18.324665ms |       127.0.0.1 | POST     "/api/show"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9601089536 required="6.2 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2172411424/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39893"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] build info | build=0 commit="unknown" tid="140213899902976" timestamp=1735231201
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140213899902976" timestamp=1735231201 total_threads=16
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39893" tid="140213899902976" timestamp=1735231201
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.928-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type  f32:   65 tensors
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type q4_0:  225 tensors
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type q6_K:    1 tensors
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_vocab: special tokens cache size = 256
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: arch             = llama
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: vocab type       = BPE
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_vocab          = 128256
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_merges         = 280147
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: vocab_only       = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd           = 4096
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_layer          = 32
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_head           = 32
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_head_kv        = 8
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_rot            = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_swa            = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_gqa            = 4
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ff             = 14336
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_expert         = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_expert_used    = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: causal attn      = 1
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: pooling type     = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope type        = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope scaling     = linear
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: freq_scale_train = 1
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_state      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model type       = 8B
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model ftype      = Q4_0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model params     = 8.03 B
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: max token length = 256
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: found 1 CUDA devices:
+Dec 26 08:40:02 launchpad ollama[1579]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_ctx      = 8192
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_batch    = 512
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_ubatch   = 512
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: flash_attn = 0
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: freq_scale = 1
+Dec 26 08:40:07 launchpad ollama[1579]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: graph nodes  = 1030
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: graph splits = 2
+Dec 26 08:40:07 launchpad ollama[25550]: INFO [main] model loaded | tid="140213899902976" timestamp=1735231207
+Dec 26 08:40:07 launchpad ollama[1579]: time=2024-12-26T08:40:07.447-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Dec 26 08:40:07 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:07 | 200 |   5.95389214s |       127.0.0.1 | POST     "/api/generate"
+Dec 26 08:44:08 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:44:08 | 200 |  5.773063495s |       127.0.0.1 | POST     "/api/chat"
+Dec 26 08:45:16 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:45:16 | 200 |  3.236343889s |       127.0.0.1 | POST     "/api/chat"
+Dec 26 08:46:04 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:46:04 | 200 |  1.811321635s |       127.0.0.1 | POST     "/api/chat"
+-- Boot 59e4e91c265e46c5b4d70cfcce7e99e2 --
+Dec 26 14:11:39 launchpad systemd[1]: Starting Server for local large language models...
+Dec 26 14:11:39 launchpad systemd[1]: Started Server for local large language models.
+Dec 26 14:11:39 launchpad ollama[1607]: 2024/12/26 14:11:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.003-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.007-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.008-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.010-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2730878161/runners
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.015-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.248-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 27 14:38:43 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:38:43 | 404 |     505.897µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789557248 required="6.2 GiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35155"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.877-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] build info | build=0 commit="unknown" tid="140036355035136" timestamp=1735339131
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140036355035136" timestamp=1735339131 total_threads=16
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35155" tid="140036355035136" timestamp=1735339131
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: time=2024-12-27T14:38:51.128-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 256
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: vocab type       = BPE
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 280147
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 4096
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_head           = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 4
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 14336
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model type       = 8B
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: max token length = 256
+Dec 27 14:38:51 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.14 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1024.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1024.00 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 2.02 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     2.02 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 560.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   560.01 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:38:56 launchpad ollama[600432]: INFO [main] model loaded | tid="140036355035136" timestamp=1735339136
+Dec 27 14:38:56 launchpad ollama[1607]: time=2024-12-27T14:38:56.898-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Dec 27 14:39:52 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:39:52 | 200 |          1m1s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.317-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.125426471 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.566-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.374853636 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.817-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.625394288 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:51:17 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:51:17 | 404 |      79.667µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.169-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8752005120 required="6.2 GiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.170-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.170-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44609"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] build info | build=0 commit="unknown" tid="139846059413504" timestamp=1735339896
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139846059413504" timestamp=1735339896 total_threads=16
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44609" tid="139846059413504" timestamp=1735339896
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 256
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.422-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: vocab type       = BPE
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 280147
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 4096
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_head           = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 4
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 14336
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model type       = 8B
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: max token length = 256
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.14 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1024.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1024.00 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 2.02 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     2.02 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 560.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   560.01 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:51:37 launchpad ollama[601091]: INFO [main] model loaded | tid="139846059413504" timestamp=1735339897
+Dec 27 14:51:37 launchpad ollama[1607]: time=2024-12-27T14:51:37.175-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |         1m55s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |   12.793602ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |   12.590232ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 | 28.286425244s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 |   12.636286ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 |   13.119285ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:57:10 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:57:10 | 404 |      67.309µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:58:07 launchpad ollama[1607]: time=2024-12-27T14:58:07.619-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 14:58:07 launchpad ollama[1607]: time=2024-12-27T14:58:07.759-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="5.4 GiB"
+Dec 27 14:58:12 launchpad ollama[1607]: time=2024-12-27T14:58:12.882-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.121904144 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.131-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.371374687 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.138-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.138-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=35 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 35 --parallel 1 --port 45005"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] build info | build=0 commit="unknown" tid="140670149808128" timestamp=1735340293
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140670149808128" timestamp=1735340293 total_threads=16
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45005" tid="140670149808128" timestamp=1735340293
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 3
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V2
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: vocab type       = SPM
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_head           = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 13824
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model type       = 13B
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: general.name     = codellama
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: max token length = 48
+Dec 27 14:58:13 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.382-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.621899773 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.390-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors: offloading 35 repeating layers to GPU
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors: offloaded 35/41 layers to GPU
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:58:20 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1600.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 0.14 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 204.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   204.01 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:58:21 launchpad ollama[602803]: INFO [main] model loaded | tid="140670149808128" timestamp=1735340301
+Dec 27 14:58:21 launchpad ollama[1607]: time=2024-12-27T14:58:21.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.53 seconds"
+Dec 27 14:58:21 launchpad ollama[602803]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165165 n_keep=4 n_left=2044 n_shift=1022 tid="140670149808128" timestamp=1735340301
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |         2m47s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |    3.849651ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |    3.592242ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:03:20 launchpad ollama[1607]: time=2024-12-27T15:03:20.680-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:03:20 launchpad ollama[602803]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165341 n_keep=4 n_left=2044 n_shift=1022 tid="140670149808128" timestamp=1735340600
+Dec 27 15:05:12 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:05:12 | 200 |         1m51s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:08:45 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:08:45 | 200 |    1.617502ms |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:09:44 launchpad ollama[1607]: time=2024-12-27T15:09:44.086-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:11:12 launchpad ollama[1607]: time=2024-12-27T15:11:12.315-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:12:14 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:12:14 | 200 |         2m30s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:14:33 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:14:33 | 200 |         3m21s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:19:38 launchpad ollama[1607]: time=2024-12-27T15:19:38.701-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.133624605 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:19:38 launchpad ollama[1607]: time=2024-12-27T15:19:38.951-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.383435707 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:19:39 launchpad ollama[1607]: time=2024-12-27T15:19:39.201-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.633218749 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:22:58 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 27 15:23:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 27 15:23:03 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 27 15:23:03 launchpad systemd[1]: ollama.service: Consumed 1h 46min 7.781s CPU time, 14G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 44ff52faa26f4443971e9c3d29efe1cd --
+Dec 27 15:23:46 launchpad systemd[1]: Starting Server for local large language models...
+Dec 27 15:23:46 launchpad systemd[1]: Started Server for local large language models.
+Dec 27 15:23:46 launchpad ollama[1503]: 2024/12/27 15:23:46 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.207-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.212-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.213-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.214-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2343501151/runners
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.125-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.357-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 27 15:25:31 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:25:31 | 200 |    2.681026ms |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.790-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.954-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10755571712 required="9.2 GiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.954-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.6 GiB" free_swap="68.9 GiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.955-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[10.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 33093"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.957-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] build info | build=0 commit="unknown" tid="140345678643200" timestamp=1735341975
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140345678643200" timestamp=1735341975 total_threads=16
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33093" tid="140345678643200" timestamp=1735341975
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 15:26:15 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 15:26:15 launchpad ollama[1503]: time=2024-12-27T15:26:15.207-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 15:26:23 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 15:26:23 launchpad ollama[4157]: INFO [main] model loaded | tid="140345678643200" timestamp=1735341983
+Dec 27 15:26:23 launchpad ollama[1503]: time=2024-12-27T15:26:23.733-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 27 15:26:29 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:26:29 | 200 |  14.38802252s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:27:16 launchpad ollama[1503]: time=2024-12-27T15:27:16.658-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:27:31 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:27:31 | 200 | 14.992202305s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:30:08 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:08 | 404 |     133.706µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:30:19 launchpad ollama[1503]: time=2024-12-27T15:30:19.619-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |  6.441713687s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |    4.374709ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |    3.253192ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:31:16 launchpad ollama[1503]: time=2024-12-27T15:31:16.387-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:31:16 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165044 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342276
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |   8.40532896s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |    3.140186ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |    3.681763ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:35:49 launchpad ollama[1503]: time=2024-12-27T15:35:49.701-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:35:49 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165433 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342549
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |  9.704588023s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |     3.98204ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |    3.572789ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:36:45 launchpad ollama[1503]: time=2024-12-27T15:36:45.921-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:36:46 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165712 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342606
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 | 11.632665765s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 |    4.151026ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 |     3.69001ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:37:08 launchpad ollama[1503]: time=2024-12-27T15:37:08.348-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:37:08 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=166048 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342628
+Dec 27 15:37:19 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:19 | 200 | 11.600473915s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:37:20 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:20 | 200 |    3.417516ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:37:20 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:20 | 200 |    3.097743ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:38:07 launchpad ollama[1503]: time=2024-12-27T15:38:07.935-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:38:08 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=166391 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342688
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |  9.632512934s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |    3.812244ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |    3.174486ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:39:18 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:39:18 | 200 |     737.788µs |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:39:47 launchpad ollama[1503]: time=2024-12-27T15:39:47.134-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:39:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:39:53 | 200 |   6.46578068s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:41:02 launchpad ollama[1503]: time=2024-12-27T15:41:02.440-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:41:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:41:09 | 200 |  6.900632764s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:42:19 launchpad ollama[1503]: time=2024-12-27T15:42:19.222-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:42:19 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1933 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342939
+Dec 27 15:42:30 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:42:30 | 200 | 11.479806001s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:45:28 launchpad ollama[1503]: time=2024-12-27T15:45:28.683-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:45:28 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2262 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343128
+Dec 27 15:45:38 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:45:38 | 200 |  9.804324867s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:47:14 launchpad ollama[1503]: time=2024-12-27T15:47:14.008-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:47:14 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2554 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343234
+Dec 27 15:47:25 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:47:25 | 200 | 11.349651911s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:48:15 launchpad ollama[1503]: time=2024-12-27T15:48:15.040-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:48:15 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2887 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343295
+Dec 27 15:48:25 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:48:25 | 200 | 10.460803877s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:49:06 launchpad ollama[1503]: time=2024-12-27T15:49:06.009-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:49:06 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3193 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343346
+Dec 27 15:49:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:49:09 | 200 |  3.493525323s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:49:45 launchpad ollama[1503]: time=2024-12-27T15:49:45.446-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:49:45 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3282 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343385
+Dec 27 15:49:48 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:49:48 | 200 |  3.353399063s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:50:23 launchpad ollama[1503]: time=2024-12-27T15:50:23.898-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:50:23 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3359 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343423
+Dec 27 15:50:32 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:50:32 | 200 |  8.290711857s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:50:45 launchpad ollama[1503]: time=2024-12-27T15:50:45.916-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:50:45 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3598 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343445
+Dec 27 15:50:54 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:50:54 | 200 |  8.171207666s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:54:04 launchpad ollama[1503]: time=2024-12-27T15:54:04.841-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:54:04 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3833 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343644
+Dec 27 15:54:22 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:54:22 | 200 | 17.884059421s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:55:58 launchpad ollama[1503]: time=2024-12-27T15:55:58.184-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:55:58 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=4398 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343758
+Dec 27 15:56:10 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:56:10 | 200 | 12.459036639s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:56:44 launchpad ollama[1503]: time=2024-12-27T15:56:44.814-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:56:44 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=9355 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343804
+Dec 27 15:57:03 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:57:03 | 200 | 18.678989865s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:57:51 launchpad ollama[1503]: time=2024-12-27T15:57:51.645-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:57:51 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=12260 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343871
+Dec 27 15:57:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:57:59 | 200 |  7.465078241s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:58:38 launchpad ollama[1503]: time=2024-12-27T15:58:38.522-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:58:38 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=14838 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343918
+Dec 27 15:58:55 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:58:55 | 200 | 17.406273881s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:00:27 launchpad ollama[1503]: time=2024-12-27T16:00:27.846-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:00:27 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=17703 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735344027
+Dec 27 16:00:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:00:57 | 200 | 29.523803789s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:01:23 launchpad ollama[1503]: time=2024-12-27T16:01:23.237-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:01:23 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20029 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735344083
+Dec 27 16:01:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:01:27 | 200 |  4.749219744s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.017-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.196-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401546240 required="9.2 GiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.196-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.197-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.197-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 40899"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] build info | build=0 commit="unknown" tid="139803143118848" timestamp=1735345664
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139803143118848" timestamp=1735345664 total_threads=16
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40899" tid="139803143118848" timestamp=1735345664
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:27:44 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.475-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:27:45 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:27:45 launchpad ollama[15881]: INFO [main] model loaded | tid="139803143118848" timestamp=1735345665
+Dec 27 16:27:45 launchpad ollama[1503]: time=2024-12-27T16:27:45.478-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.28 seconds"
+Dec 27 16:27:45 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20362 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735345665
+Dec 27 16:27:56 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:27:56 | 200 | 12.863784421s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:31:37 launchpad ollama[1503]: time=2024-12-27T16:31:37.055-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:31:37 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20942 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735345897
+Dec 27 16:31:46 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:31:46 | 200 |  9.560043699s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:34:24 launchpad ollama[1503]: time=2024-12-27T16:34:24.627-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:34:24 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=21379 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735346064
+Dec 27 16:34:33 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:34:33 | 200 |  8.980140604s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:37:51 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:51 | 200 |      15.551µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:37:51 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:51 | 200 |   17.029928ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 16:37:51 launchpad ollama[1503]: time=2024-12-27T16:37:51.552-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="984.6 MiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10401611776 required="6.2 GiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42711"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] build info | build=0 commit="unknown" tid="140120121364480" timestamp=1735346272
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140120121364480" timestamp=1735346272 total_threads=16
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42711" tid="140120121364480" timestamp=1735346272
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 256
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.445-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: vocab type       = BPE
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 280147
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 4096
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_head           = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 4
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 14336
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model type       = 8B
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: max token length = 256
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:37:52 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:37:57 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:37:57 launchpad ollama[17188]: INFO [main] model loaded | tid="140120121364480" timestamp=1735346277
+Dec 27 16:37:57 launchpad ollama[1503]: time=2024-12-27T16:37:57.961-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Dec 27 16:37:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:57 | 200 |  6.591417148s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:38:38 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:38:38 | 200 |  605.889284ms |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:39:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:39:26 | 200 |  8.232584114s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:40:41 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:40:41 | 200 |  6.235198023s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:41:35 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:41:35 | 200 |  2.848868536s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:42:34 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:42:34 | 200 |   5.35050044s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:44:48 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:44:48 | 200 |  4.922310641s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:45:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:45:27 | 200 |   2.44683618s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:47:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:47:39 | 200 |  3.946344219s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:48:47 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:48:47 | 200 |  4.527828229s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:50:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:09 | 200 |      16.397µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:50:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:09 | 200 |      62.763µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:50:22 launchpad ollama[1503]: time=2024-12-27T16:50:22.814-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.007-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.772-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401611776 required="9.2 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.772-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.773-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45797"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] build info | build=0 commit="unknown" tid="140337552855040" timestamp=1735347023
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140337552855040" timestamp=1735347023 total_threads=16
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45797" tid="140337552855040" timestamp=1735347023
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:50:23 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:50:24 launchpad ollama[1503]: time=2024-12-27T16:50:24.095-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:50:25 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:50:25 launchpad ollama[18824]: INFO [main] model loaded | tid="140337552855040" timestamp=1735347025
+Dec 27 16:50:25 launchpad ollama[1503]: time=2024-12-27T16:50:25.350-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.58 seconds"
+Dec 27 16:50:25 launchpad ollama[18824]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=21795 n_keep=4 n_left=2044 n_shift=1022 tid="140337552855040" timestamp=1735347025
+Dec 27 16:50:28 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:28 | 200 |      17.572µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:50:28 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:28 | 200 |      16.007µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:50:33 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:33 | 200 | 11.019714285s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:50:54 launchpad ollama[1503]: time=2024-12-27T16:50:54.949-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="986.6 MiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10401611776 required="6.2 GiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.598-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39645"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] build info | build=0 commit="unknown" tid="140418646966272" timestamp=1735347055
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140418646966272" timestamp=1735347055 total_threads=16
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39645" tid="140418646966272" timestamp=1735347055
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.849-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 256
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: vocab type       = BPE
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 280147
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 4096
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_head           = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 4
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 14336
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model type       = 8B
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: max token length = 256
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:50:55 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:50:56 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:50:56 launchpad ollama[18958]: INFO [main] model loaded | tid="140418646966272" timestamp=1735347056
+Dec 27 16:50:56 launchpad ollama[1503]: time=2024-12-27T16:50:56.853-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 27 16:51:02 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:51:02 | 200 |    7.5564247s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:51:45 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:51:45 | 200 |  3.964105561s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:52:49 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:52:49 | 200 |      79.213µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:52:49 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:52:49 | 200 |      52.777µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:53:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:39 | 200 |      16.752µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:53:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:39 | 200 |      19.085µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:53:50 launchpad ollama[1503]: time=2024-12-27T16:53:50.288-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:53:50 launchpad ollama[1503]: time=2024-12-27T16:53:50.470-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.223-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401611776 required="9.2 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.223-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45217"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] build info | build=0 commit="unknown" tid="140631915614208" timestamp=1735347231
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140631915614208" timestamp=1735347231 total_threads=16
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45217" tid="140631915614208" timestamp=1735347231
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:53:51 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.541-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:53:52 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:53:52 launchpad ollama[19486]: INFO [main] model loaded | tid="140631915614208" timestamp=1735347232
+Dec 27 16:53:52 launchpad ollama[1503]: time=2024-12-27T16:53:52.545-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 27 16:53:52 launchpad ollama[19486]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22211 n_keep=4 n_left=2044 n_shift=1022 tid="140631915614208" timestamp=1735347232
+Dec 27 16:53:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:53 | 200 |      24.294µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:53:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:53 | 200 |      20.127µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:54:07 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:54:07 | 200 | 17.579127342s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 17:17:01 launchpad ollama[1503]: time=2024-12-27T17:17:01.944-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10266869760 required="9.2 GiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.150-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 37637"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] build info | build=0 commit="unknown" tid="140471900819456" timestamp=1735348622
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140471900819456" timestamp=1735348622 total_threads=16
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37637" tid="140471900819456" timestamp=1735348622
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 17:17:02 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 17:17:03 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 17:17:03 launchpad ollama[26672]: INFO [main] model loaded | tid="140471900819456" timestamp=1735348623
+Dec 27 17:17:03 launchpad ollama[1503]: time=2024-12-27T17:17:03.721-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.57 seconds"
+Dec 27 17:17:03 launchpad ollama[26672]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22719 n_keep=4 n_left=2044 n_shift=1022 tid="140471900819456" timestamp=1735348623
+Dec 27 17:17:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 17:17:09 | 200 |  7.869648553s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 17:19:09 launchpad ollama[1503]: time=2024-12-27T17:19:09.739-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 17:19:09 launchpad ollama[26672]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22918 n_keep=4 n_left=2044 n_shift=1022 tid="140471900819456" timestamp=1735348749
+Dec 27 17:19:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 17:19:27 | 200 | 17.365185082s |       127.0.0.1 | POST     "/api/generate"
+Dec 30 09:35:26 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 30 09:35:27 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 30 09:35:27 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 30 09:35:27 launchpad systemd[1]: ollama.service: Consumed 6min 45.323s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 4a5d80c7b4704b6ca2c70beb9642e6c5 --
+Dec 30 09:36:03 launchpad systemd[1]: Starting Server for local large language models...
+Dec 30 09:36:03 launchpad systemd[1]: Started Server for local large language models.
+Dec 30 09:36:03 launchpad ollama[1531]: 2024/12/30 09:36:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.337-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.343-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.344-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.344-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3263265929/runners
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.289-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.290-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.290-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.525-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 01 08:26:01 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 01 08:26:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 01 08:26:01 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 01 08:26:01 launchpad systemd[1]: ollama.service: Consumed 4.081s CPU time, 785.6M memory peak, 233.1M read from disk, 508.1M written to disk.
+-- Boot 260c13a7c30249458f47f3db2bdc3453 --
+Jan 01 08:26:35 launchpad systemd[1]: Starting Server for local large language models...
+Jan 01 08:26:35 launchpad systemd[1]: Started Server for local large language models.
+Jan 01 08:26:35 launchpad ollama[1534]: 2025/01/01 08:26:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.205-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.209-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.210-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.211-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2383843368/runners
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.162-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.162-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.370-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 01 17:02:39 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:39 | 200 |     561.176µs |       127.0.0.1 | HEAD     "/"
+Jan 01 17:02:39 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:39 | 200 |    6.095628ms |       127.0.0.1 | POST     "/api/show"
+Jan 01 17:02:39 launchpad ollama[1534]: time=2025-01-01T17:02:39.875-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.036-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.0 GiB" free_swap="68.9 GiB"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.036-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=0 layers.split="" memory.available="[253.1 MiB]" memory.gpu_overhead="0 B" memory.required.full="8.3 GiB" memory.required.partial="0 B" memory.required.kv="1.6 GiB" memory.required.allocations="[0 B]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cpu_avx2/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --no-mmap --parallel 1 --port 42311"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] build info | build=0 commit="unknown" tid="140151064412864" timestamp=1735779760
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140151064412864" timestamp=1735779760 total_threads=16
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42311" tid="140151064412864" timestamp=1735779760
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.289-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 17:02:49 launchpad ollama[1534]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model:        CPU compute buffer size =   204.01 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 1
+Jan 01 17:02:49 launchpad ollama[80568]: INFO [main] model loaded | tid="140151064412864" timestamp=1735779769
+Jan 01 17:02:49 launchpad ollama[1534]: time=2025-01-01T17:02:49.567-08:00 level=INFO source=server.go:626 msg="llama runner started in 9.53 seconds"
+Jan 01 17:02:49 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:49 | 200 |  9.695901175s |       127.0.0.1 | POST     "/api/generate"
+Jan 01 17:03:12 launchpad ollama[1534]: time=2025-01-01T17:03:12.114-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 17:04:15 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:04:15 | 200 |          1m3s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 17:09:15 launchpad ollama[1534]: cuda driver library failed to get device context 2time=2025-01-01T17:09:15.288-08:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Jan 01 17:09:15 launchpad ollama[1534]: cuda driver library failed to get device context 2time=2025-01-01T17:09:15.656-08:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.769-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.930-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.2 GiB" free_swap="68.9 GiB"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.931-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=0 layers.split="" memory.available="[1.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="8.3 GiB" memory.required.partial="0 B" memory.required.kv="1.6 GiB" memory.required.allocations="[0 B]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cpu_avx2/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --no-mmap --parallel 1 --port 39691"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] build info | build=0 commit="unknown" tid="139871342240448" timestamp=1735786736
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139871342240448" timestamp=1735786736 total_threads=16
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39691" tid="139871342240448" timestamp=1735786736
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 18:58:57 launchpad ollama[1534]: time=2025-01-01T18:58:57.184-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 18:58:59 launchpad ollama[1534]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model:        CPU compute buffer size =   204.01 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 1
+Jan 01 18:58:59 launchpad ollama[96633]: INFO [main] model loaded | tid="139871342240448" timestamp=1735786739
+Jan 01 18:58:59 launchpad ollama[1534]: time=2025-01-01T18:58:59.693-08:00 level=INFO source=server.go:626 msg="llama runner started in 2.76 seconds"
+Jan 01 19:00:17 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:00:17 | 200 |         1m20s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:00:55 launchpad ollama[1534]: time=2025-01-01T19:00:55.488-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:01:16 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:01:16 | 200 | 21.465450198s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:03:15 launchpad ollama[1534]: time=2025-01-01T19:03:15.006-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:03:44 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:03:44 | 200 |  29.17834599s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.111-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.268-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.268-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=2 layers.split="" memory.available="[1.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="1.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[1.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 2 --parallel 1 --port 41601"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] build info | build=0 commit="unknown" tid="140659831738368" timestamp=1735787440
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140659831738368" timestamp=1735787440 total_threads=16
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41601" tid="140659831738368" timestamp=1735787440
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: found 1 CUDA devices:
+Jan 01 19:10:40 launchpad ollama[1534]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.520-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: offloading 2 repeating layers to GPU
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: offloaded 2/41 layers to GPU
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors:      CUDA0 buffer size =   340.39 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 19:10:41 launchpad ollama[1534]: llama_kv_cache_init:  CUDA_Host KV buffer size =  1520.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_kv_cache_init:      CUDA0 KV buffer size =    80.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 422
+Jan 01 19:10:41 launchpad ollama[100100]: INFO [main] model loaded | tid="140659831738368" timestamp=1735787441
+Jan 01 19:10:41 launchpad ollama[1534]: time=2025-01-01T19:10:41.533-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Jan 01 19:11:25 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:11:25 | 200 | 45.575448992s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:16:30 launchpad ollama[1534]: time=2025-01-01T19:16:30.837-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.146357639 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 01 19:16:31 launchpad ollama[1534]: time=2025-01-01T19:16:31.086-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.395657879 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 01 19:16:31 launchpad ollama[1534]: time=2025-01-01T19:16:31.337-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.646032647 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 03 18:31:59 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 03 18:31:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 03 18:31:59 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 03 18:31:59 launchpad systemd[1]: ollama.service: Consumed 31min 9.751s CPU time, 16.1G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot 68bc23759a4a4fc28942066d48b2dcba --
+Jan 03 18:32:35 launchpad systemd[1]: Starting Server for local large language models...
+Jan 03 18:32:35 launchpad systemd[1]: Started Server for local large language models.
+Jan 03 18:32:35 launchpad ollama[1531]: 2025/01/03 18:32:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.336-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.340-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.342-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.343-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2922762215/runners
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.275-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.275-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.480-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:43 | 200 |     781.325µs |       127.0.0.1 | HEAD     "/"
+Jan 06 15:38:43 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:43 | 200 |   17.251415ms |       127.0.0.1 | POST     "/api/show"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8739618816 required="6.2 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2922762215/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40151"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] build info | build=0 commit="unknown" tid="139790765953024" timestamp=1736206723
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790765953024" timestamp=1736206723 total_threads=16
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40151" tid="139790765953024" timestamp=1736206723
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type  f32:   65 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type q4_0:  225 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type q6_K:    1 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.968-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_vocab: special tokens cache size = 256
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: arch             = llama
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: vocab type       = BPE
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_vocab          = 128256
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_merges         = 280147
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: vocab_only       = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd           = 4096
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_layer          = 32
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_head           = 32
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_head_kv        = 8
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_rot            = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_swa            = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_gqa            = 4
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ff             = 14336
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_expert         = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_expert_used    = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: causal attn      = 1
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: pooling type     = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope type        = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope scaling     = linear
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: freq_scale_train = 1
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_state      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model type       = 8B
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model ftype      = Q4_0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model params     = 8.03 B
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: max token length = 256
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: found 1 CUDA devices:
+Jan 06 15:38:44 launchpad ollama[1531]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_ctx      = 8192
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_batch    = 512
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_ubatch   = 512
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: flash_attn = 0
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: freq_scale = 1
+Jan 06 15:38:49 launchpad ollama[1531]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: graph nodes  = 1030
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: graph splits = 2
+Jan 06 15:38:49 launchpad ollama[401857]: INFO [main] model loaded | tid="139790765953024" timestamp=1736206729
+Jan 06 15:38:49 launchpad ollama[1531]: time=2025-01-06T15:38:49.486-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Jan 06 15:38:49 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:49 | 200 |  5.951679064s |       127.0.0.1 | POST     "/api/generate"
+Jan 06 15:43:00 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:43:00 | 200 |  7.356289318s |       127.0.0.1 | POST     "/api/chat"
+Jan 06 15:45:54 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:45:54 | 200 |  8.194895954s |       127.0.0.1 | POST     "/api/chat"
+Jan 06 15:50:07 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:50:07 | 200 |  6.660291288s |       127.0.0.1 | POST     "/api/chat"
+-- Boot f57f0e33b6b440d58dc36136ed7e598b --
+Jan 06 19:22:05 launchpad systemd[1]: Starting Server for local large language models...
+Jan 06 19:22:05 launchpad systemd[1]: Started Server for local large language models.
+Jan 06 19:22:05 launchpad ollama[1622]: 2025/01/06 19:22:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.372-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.376-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.378-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.380-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama448366513/runners
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:10 launchpad ollama[1622]: time=2025-01-06T19:22:10.028-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 06 19:23:52 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: State 'stop-sigterm' timed out. Killing.
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: Killing process 1622 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: Killing process 1666 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:26:52 launchpad systemd[1]: ollama.service: Processes still around after SIGKILL. Ignoring.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: State 'final-sigterm' timed out. Killing.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: Killing process 1622 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: Killing process 1666 (.ollama-wrapped) with signal SIGKILL.
+-- Boot 57b5dff306f2462f905c3306555265b4 --
+Jan 06 19:30:26 launchpad systemd[1]: Starting Server for local large language models...
+Jan 06 19:30:26 launchpad systemd[1]: Started Server for local large language models.
+Jan 06 19:30:26 launchpad ollama[1534]: 2025/01/06 19:30:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.355-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.360-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.360-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.362-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2024859506/runners
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.498-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 06 19:37:04 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 06 19:37:04 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 06 19:37:04 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 06 19:37:04 launchpad systemd[1]: ollama.service: Consumed 3.346s CPU time, 786.2M memory peak, 233.4M read from disk, 508.1M written to disk.
+-- Boot 9d5500554b66400fb07cf0b1edd61af5 --
+Jan 07 07:34:16 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 07:34:16 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 07:34:16 launchpad ollama[1540]: 2025/01/07 07:34:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.277-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.284-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.285-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.286-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama758438586/runners
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.365-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.554-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 07 12:53:59 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 12:53:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 12:53:59 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 12:53:59 launchpad systemd[1]: ollama.service: Consumed 3.700s CPU time, 787.8M memory peak, 233M read from disk, 508.1M written to disk.
+-- Boot f682186d89a94a0bba220de6233eee6e --
+Jan 07 12:54:31 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 12:54:31 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 12:54:31 launchpad ollama[1532]: 2025/01/07 12:54:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.428-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.434-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.435-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.435-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama446181134/runners
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.397-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.397-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.622-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 07 13:18:52 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 13:18:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 13:18:52 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 13:18:52 launchpad systemd[1]: ollama.service: Consumed 3.414s CPU time, 786M memory peak, 233M read from disk, 508.1M written to disk.
+-- Boot ec29c3ea9d1f4afb95b8205796087b74 --
+Jan 07 13:19:24 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 13:19:24 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 13:19:24 launchpad ollama[1526]: 2025/01/07 13:19:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.895-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.901-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.902-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.905-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2599412483/runners
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.799-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:28 launchpad ollama[1526]: time=2025-01-07T13:19:28.015-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Jan 07 18:52:45 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 18:52:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 18:52:45 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 18:52:45 launchpad systemd[1]: ollama.service: Consumed 3.632s CPU time, 786M memory peak, 233.2M read from disk, 508.1M written to disk.
+-- Boot 855489849095400cab96d7df50fa4664 --
+Jan 07 18:53:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 18:53:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 18:53:40 launchpad ollama[1595]: 2025/01/07 18:53:40 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.034-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.042-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.043-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.045-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2772112966/runners
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.063-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:44 launchpad ollama[1595]: time=2025-01-07T18:53:44.819-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 07 18:54:19 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 18:54:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 18:54:20 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 18:54:20 launchpad systemd[1]: ollama.service: Consumed 5.283s CPU time, 789M memory peak, 235.9M read from disk, 508.1M written to disk.
+-- Boot 294cf7b93f014947854eb454411dbc1d --
+Jan 07 18:54:51 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 18:54:51 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 18:54:51 launchpad ollama[1618]: 2025/01/07 18:54:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.180-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.185-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.186-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.187-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3742342109/runners
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.093-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.094-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.094-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.095-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.095-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.330-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 08 08:55:07 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 08 08:55:07 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 08 08:55:07 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 08 08:55:07 launchpad systemd[1]: ollama.service: Consumed 3.399s CPU time, 785.5M memory peak, 233.1M read from disk, 508.1M written to disk.
+-- Boot 386ee235859d401d9b09ce3536af41c0 --
+Jan 08 08:55:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 08 08:55:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 08 08:55:39 launchpad ollama[1534]: 2025/01/08 08:55:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.817-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.822-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.822-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.824-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2571630/runners
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.792-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.792-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.999-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Jan 09 14:19:09 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 09 14:19:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 09 14:19:09 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 09 14:19:09 launchpad systemd[1]: ollama.service: Consumed 3.964s CPU time, 786M memory peak, 233.2M read from disk, 508.1M written to disk.
+-- Boot d7b8ec2daadb429b9bb273c7a542d90a --
+Jan 09 14:19:14 launchpad systemd[1]: Starting Server for local large language models...
+Jan 09 14:19:14 launchpad systemd[1]: Started Server for local large language models.
+Jan 09 14:19:14 launchpad ollama[1530]: 2025/01/09 14:19:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.621-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.627-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.628-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.628-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama782984971/runners
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.541-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.753-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 12 09:46:00 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 09:46:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 09:46:00 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 09:46:00 launchpad systemd[1]: ollama.service: Consumed 4.351s CPU time, 786.2M memory peak, 233.3M read from disk, 508.1M written to disk.
+Jan 12 09:46:15 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 09:46:15 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 09:46:15 launchpad ollama[430897]: 2025/01/12 09:46:15 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.353-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.357-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.357-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.358-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3398906615/runners
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.540-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.7 GiB"
+Jan 12 12:17:22 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:17:22 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:17:22 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:17:22 launchpad systemd[1]: ollama.service: Consumed 3.492s CPU time, 560.1M memory peak, 7.5M read from disk, 508.1M written to disk.
+-- Boot e311e8cc46c445b8bdb99d06997efb8d --
+Jan 12 12:18:05 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:18:05 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:18:05 launchpad ollama[1516]: 2025/01/12 12:18:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.518-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.523-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.525-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.526-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4005739148/runners
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.356-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.356-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:10 launchpad ollama[1516]: time=2025-01-12T12:18:10.078-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 12 12:18:25 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:18:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:18:26 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:18:26 launchpad systemd[1]: ollama.service: Consumed 5.107s CPU time, 786.4M memory peak, 233.4M read from disk, 508.1M written to disk.
+-- Boot 92062c6e1bfb461c92e69af8c3bc6804 --
+Jan 12 12:19:04 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:19:04 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:19:04 launchpad ollama[1519]: 2025/01/12 12:19:04 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.322-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.328-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.329-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.331-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2431135854/runners
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.158-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.158-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:08 launchpad ollama[1519]: time=2025-01-12T12:19:08.873-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 12 12:19:53 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:19:53 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:19:53 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:19:53 launchpad systemd[1]: ollama.service: Consumed 5.135s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c1318a295cf74419a0ed2c3b8704e973 --
+Jan 12 12:20:48 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:20:48 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:20:48 launchpad ollama[1519]: 2025/01/12 12:20:48 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.278-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.285-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.287-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.287-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4073200553/runners
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.380-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.380-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.599-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 12 13:23:41 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 13:23:42 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 13:23:42 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 13:23:42 launchpad systemd[1]: ollama.service: Consumed 3.574s CPU time, 787.3M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot a2f47ad6412e4d348d67fd5c54faed9c --
+Jan 12 13:24:16 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 13:24:16 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 13:24:16 launchpad ollama[1524]: 2025/01/12 13:24:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.651-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.658-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.660-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.661-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama384067037/runners
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.755-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.755-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.983-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:44 | 200 |     820.573µs |       127.0.0.1 | HEAD     "/"
+Jan 19 11:07:44 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:44 | 200 |   16.168757ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.743-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10062856192 required="6.2 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.743-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.744-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.745-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45051"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] build info | build=0 commit="unknown" tid="140213763547136" timestamp=1737313664
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140213763547136" timestamp=1737313664 total_threads=16
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45051" tid="140213763547136" timestamp=1737313664
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.997-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:07:45 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:07:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:07:50 launchpad ollama[165551]: INFO [main] model loaded | tid="140213763547136" timestamp=1737313670
+Jan 19 11:07:50 launchpad ollama[1524]: time=2025-01-19T11:07:50.765-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Jan 19 11:07:50 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:50 | 200 |  6.190084904s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10127933440 required="6.2 GiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34491"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] build info | build=0 commit="unknown" tid="140517491892224" timestamp=1737314336
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140517491892224" timestamp=1737314336 total_threads=16
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34491" tid="140517491892224" timestamp=1737314336
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.836-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:18:56 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:18:57 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:18:57 launchpad ollama[167260]: INFO [main] model loaded | tid="140517491892224" timestamp=1737314337
+Jan 19 11:18:57 launchpad ollama[1524]: time=2025-01-19T11:18:57.589-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Jan 19 11:19:04 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:19:04 | 200 |  8.000557174s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:23:41 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:23:41 | 200 |  4.999027767s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10127933440 required="6.2 GiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43393"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] build info | build=0 commit="unknown" tid="140039068495872" timestamp=1737314990
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039068495872" timestamp=1737314990 total_threads=16
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43393" tid="140039068495872" timestamp=1737314990
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.320-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:29:50 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:29:51 launchpad ollama[168779]: INFO [main] model loaded | tid="140039068495872" timestamp=1737314991
+Jan 19 11:29:51 launchpad ollama[1524]: time=2025-01-19T11:29:51.073-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 19 11:30:00 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:30:00 | 200 | 10.605879085s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:34:43 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:34:43 | 200 |  7.165432862s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:39:18 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:39:18 | 200 |  4.485852787s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:49:12 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:12 | 200 |      15.965µs |       127.0.0.1 | HEAD     "/"
+Jan 19 11:49:12 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:12 | 200 |    7.696369ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.172-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9993977856 required="9.2 GiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45113"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] build info | build=0 commit="unknown" tid="140310543175680" timestamp=1737316152
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140310543175680" timestamp=1737316152 total_threads=16
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45113" tid="140310543175680" timestamp=1737316152
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type  f32:   81 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type q4_0:  281 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 3
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V2
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: vocab type       = SPM
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 32016
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_head           = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 13824
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model type       = 13B
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model params     = 13.02 B
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: general.name     = codellama
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 1 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 2 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: UNK token        = 0 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: max token length = 48
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:49:12 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.578-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloading 40 repeating layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloaded 41/41 layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 2048
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:49:20 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1286
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:49:20 launchpad ollama[171728]: INFO [main] model loaded | tid="140310543175680" timestamp=1737316160
+Jan 19 11:49:20 launchpad ollama[1524]: time=2025-01-19T11:49:20.851-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Jan 19 11:49:20 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:20 | 200 |  8.682301556s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 11:50:10 launchpad ollama[1524]: time=2025-01-19T11:50:10.653-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:50:23 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:50:23 | 200 | 13.275953621s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:52:24 launchpad ollama[1524]: time=2025-01-19T11:52:24.845-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:52:27 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:52:27 | 200 |  2.336261304s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:16:49 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:49 | 200 |       15.87µs |       127.0.0.1 | HEAD     "/"
+Jan 19 12:16:49 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:49 | 200 |   12.606595ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9968549888 required="6.2 GiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38815"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] build info | build=0 commit="unknown" tid="140526793773056" timestamp=1737317809
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140526793773056" timestamp=1737317809 total_threads=16
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38815" tid="140526793773056" timestamp=1737317809
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 12:16:50 launchpad ollama[1524]: time=2025-01-19T12:16:50.003-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 12:16:50 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 12:16:50 launchpad ollama[175994]: INFO [main] model loaded | tid="140526793773056" timestamp=1737317810
+Jan 19 12:16:50 launchpad ollama[1524]: time=2025-01-19T12:16:50.755-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 19 12:16:50 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:50 | 200 |   1.17554757s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 12:19:01 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:19:01 | 200 |  313.152684ms |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:20:16 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:20:16 | 200 |  6.505974129s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:24:28 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:24:28 | 200 |  538.875114ms |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:32:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:50 | 200 |      17.302µs |       127.0.0.1 | HEAD     "/"
+Jan 20 16:32:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:50 | 200 |   12.480656ms |       127.0.0.1 | POST     "/api/show"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9720692736 required="6.2 GiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34745"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] build info | build=0 commit="unknown" tid="140111006040064" timestamp=1737419571
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140111006040064" timestamp=1737419571 total_threads=16
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34745" tid="140111006040064" timestamp=1737419571
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.297-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 20 16:32:51 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 20 16:32:52 launchpad ollama[300296]: INFO [main] model loaded | tid="140111006040064" timestamp=1737419572
+Jan 20 16:32:52 launchpad ollama[1524]: time=2025-01-20T16:32:52.051-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 20 16:32:52 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:52 | 200 |   1.17586167s |       127.0.0.1 | POST     "/api/generate"
+Jan 20 16:35:58 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:35:58 | 200 |  6.241917128s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:40:34 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:40:34 | 200 |  4.398668688s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:41:03 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:41:03 | 200 |  2.152900794s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:42:57 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:42:57 | 200 |  2.608396964s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:44:47 launchpad ollama[300296]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1388 n_keep=24 n_left=2024 n_shift=1012 tid="140111006040064" timestamp=1737420287
+Jan 20 16:44:54 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:44:54 | 200 |  6.853014312s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:46:43 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:46:43 | 200 |  2.143516567s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:50:00 launchpad ollama[300296]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1012 n_keep=24 n_left=2024 n_shift=1012 tid="140111006040064" timestamp=1737420600
+Jan 20 16:50:02 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:50:02 | 200 |  2.454058788s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:52:31 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:52:31 | 200 |  3.326791285s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:54:15 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:54:15 | 200 |  4.370832569s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:55:38 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:55:38 | 200 |  1.904041014s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:56:10 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:56:10 | 200 |  1.902319388s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:56:42 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:56:42 | 200 |   2.27399481s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:58:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:58:50 | 200 |  3.696276704s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:00:53 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:00:53 | 200 |  5.035283596s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:02:18 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:02:18 | 200 |  5.681649967s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:03:59 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:03:59 | 200 |  3.657473484s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:05:45 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:05:45 | 200 |   5.40724379s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:08:35 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:08:35 | 200 |  5.686849969s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:09:56 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:09:56 | 200 |    5.3097907s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:11:34 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:11:34 | 200 |  5.785954266s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:13:56 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:13:56 | 200 |  5.040395188s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:15:20 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:15:20 | 200 |  4.661986849s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:17:27 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:17:27 | 200 |  6.319957332s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:18:20 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:18:20 | 200 |  5.275236009s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:22:08 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:22:08 | 200 |  4.188055837s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:25:16 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:25:16 | 200 |  9.162616578s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:27:24 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:27:24 | 200 |  7.075603632s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:30:07 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:30:07 | 200 |  6.661916832s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:33:17 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:33:17 | 200 |  6.645453646s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:35:35 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:35:35 | 200 |  5.772994211s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:38:54 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:38:54 | 200 |  6.114248872s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:40:18 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:40:18 | 200 |  4.767655427s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:44:51 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:44:51 | 200 |  7.625060365s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.031-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9912516608 required="6.2 GiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.031-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.032-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33273"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] build info | build=0 commit="unknown" tid="140283534491648" timestamp=1737425988
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140283534491648" timestamp=1737425988 total_threads=16
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33273" tid="140283534491648" timestamp=1737425988
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.284-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 20 18:19:48 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 20 18:19:49 launchpad ollama[316038]: INFO [main] model loaded | tid="140283534491648" timestamp=1737425989
+Jan 20 18:19:49 launchpad ollama[1524]: time=2025-01-20T18:19:49.037-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 20 18:19:55 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:19:55 | 200 |  8.056079776s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:21:09 launchpad ollama[316038]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="140283534491648" timestamp=1737426069
+Jan 20 18:21:13 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:21:13 | 200 |  4.941487318s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:21:58 launchpad ollama[316038]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="140283534491648" timestamp=1737426118
+Jan 20 18:22:02 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:22:02 | 200 |  4.487143889s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:23:07 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:23:07 | 200 |  7.047716353s |       127.0.0.1 | POST     "/api/chat"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9858056192 required="6.2 GiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46803"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] build info | build=0 commit="unknown" tid="140409680281600" timestamp=1737571319
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140409680281600" timestamp=1737571319 total_threads=16
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46803" tid="140409680281600" timestamp=1737571319
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.256-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 22 10:41:59 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] model loaded | tid="140409680281600" timestamp=1737571319
+Jan 22 10:42:00 launchpad ollama[1524]: time=2025-01-22T10:42:00.009-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 22 10:42:05 launchpad ollama[1524]: [GIN] 2025/01/22 - 10:42:05 | 200 |  6.501525919s |       127.0.0.1 | POST     "/api/chat"
+Jan 28 08:29:33 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 28 08:29:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 28 08:29:33 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 28 08:29:34 launchpad systemd[1]: ollama.service: Consumed 4min 7.156s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk, 9.8M incoming IP traffic, 9.5M outgoing IP traffic.
+-- Boot db9ee19690e0445f98acd4e27cfa66df --
+Jan 28 08:30:06 launchpad systemd[1]: Starting Server for local large language models...
+Jan 28 08:30:06 launchpad systemd[1]: Started Server for local large language models.
+Jan 28 08:30:06 launchpad ollama[1526]: 2025/01/28 08:30:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.439-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.444-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.446-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.447-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3066321188/runners
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.390-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.630-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 29 09:51:03 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 29 09:51:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 29 09:51:03 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 29 09:51:03 launchpad systemd[1]: ollama.service: Consumed 3.768s CPU time, 786.7M memory peak, 233.5M read from disk, 508.1M written to disk.
+-- Boot 452893ba9f734cdb848d42607b4ad7b8 --
+Jan 29 09:51:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 29 09:51:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 29 09:51:39 launchpad ollama[1542]: 2025/01/29 09:51:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.538-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.546-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.548-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.550-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1578890084/runners
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.729-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.729-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.936-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:20:48 | 200 |     782.496µs |       127.0.0.1 | HEAD     "/"
+Feb 01 09:20:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:20:48 | 200 |    6.904482ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.070-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.241-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10052370432 required="9.2 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.241-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.242-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.243-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 34129"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] build info | build=0 commit="unknown" tid="139747224600576" timestamp=1738430448
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139747224600576" timestamp=1738430448 total_threads=16
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34129" tid="139747224600576" timestamp=1738430448
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type  f32:   81 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type q4_0:  281 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 3
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V2
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: vocab type       = SPM
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 32016
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_head           = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 13824
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model type       = 13B
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model params     = 13.02 B
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: general.name     = codellama
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 1 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 2 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: UNK token        = 0 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: max token length = 48
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 09:20:48 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.495-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloading 40 repeating layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloaded 41/41 layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 09:21:20 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1286
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 09:21:20 launchpad ollama[323448]: INFO [main] model loaded | tid="139747224600576" timestamp=1738430480
+Feb 01 09:21:20 launchpad ollama[1542]: time=2025-02-01T09:21:20.835-08:00 level=INFO source=server.go:626 msg="llama runner started in 32.59 seconds"
+Feb 01 09:21:20 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:21:20 | 200 | 32.768836167s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 09:26:20 launchpad ollama[1542]: time=2025-02-01T09:26:20.183-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 01 09:26:31 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:26:31 | 200 | 11.029840079s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:39:18 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:18 | 200 |      21.515µs |       127.0.0.1 | HEAD     "/"
+Feb 01 10:39:18 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:18 | 200 |   18.915561ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.250-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9942007808 required="6.2 GiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.251-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.251-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34429"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.253-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] build info | build=0 commit="unknown" tid="139697006374912" timestamp=1738435158
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139697006374912" timestamp=1738435158 total_threads=16
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34429" tid="139697006374912" timestamp=1738435158
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type  f32:   65 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type q4_0:  225 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 256
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.504-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 128256
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 280147
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 4096
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_head           = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 8
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 4
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 14336
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model type       = 8B
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model params     = 8.03 B
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 10:39:18 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 8192
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 10:39:24 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1030
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 10:39:24 launchpad ollama[336212]: INFO [main] model loaded | tid="139697006374912" timestamp=1738435164
+Feb 01 10:39:24 launchpad ollama[1542]: time=2025-02-01T10:39:24.271-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Feb 01 10:39:24 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:24 | 200 |  6.194738756s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 10:40:25 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:40:25 | 200 |  7.787552389s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:42:36 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:42:36 | 200 |  9.552961921s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:47:09 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:47:09 | 200 |  8.634577406s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:49:14 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:49:14 | 200 |  4.965052598s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:52:17 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:52:17 | 200 |  7.209837514s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:54:43 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:54:43 | 200 |  8.020455569s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.991-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9975562240 required="6.2 GiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.991-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.992-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38471"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] build info | build=0 commit="unknown" tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139931340832768" timestamp=1738437053 total_threads=16
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38471" tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type  f32:   65 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type q4_0:  225 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 256
+Feb 01 11:10:53 launchpad ollama[1542]: time=2025-02-01T11:10:53.244-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 128256
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 280147
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 4096
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_head           = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 8
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 4
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 14336
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model type       = 8B
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model params     = 8.03 B
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 11:10:53 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1030
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] model loaded | tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[1542]: time=2025-02-01T11:10:53.997-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 01 11:11:04 launchpad ollama[1542]: [GIN] 2025/02/01 - 11:11:04 | 200 |  11.78116143s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:38:29 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:38:29 | 200 |      15.521µs |       127.0.0.1 | HEAD     "/"
+Feb 01 15:38:30 launchpad ollama[1542]: time=2025-02-01T15:38:30.932-08:00 level=INFO source=download.go:175 msg="downloading 5ff0abeeac1d in 16 556 MB part(s)"
+Feb 01 15:44:43 launchpad ollama[1542]: time=2025-02-01T15:44:43.463-08:00 level=INFO source=download.go:175 msg="downloading 22091531faf0 in 1 705 B part(s)"
+Feb 01 15:44:44 launchpad ollama[1542]: time=2025-02-01T15:44:44.912-08:00 level=INFO source=download.go:175 msg="downloading 4bb71764481f in 1 13 KB part(s)"
+Feb 01 15:44:46 launchpad ollama[1542]: time=2025-02-01T15:44:46.178-08:00 level=INFO source=download.go:175 msg="downloading 1c8f573e830c in 1 1.1 KB part(s)"
+Feb 01 15:44:47 launchpad ollama[1542]: time=2025-02-01T15:44:47.491-08:00 level=INFO source=download.go:175 msg="downloading 19f2fb9e8bc6 in 1 32 B part(s)"
+Feb 01 15:44:48 launchpad ollama[1542]: time=2025-02-01T15:44:48.790-08:00 level=INFO source=download.go:175 msg="downloading 34488e453cfe in 1 568 B part(s)"
+Feb 01 15:44:55 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:44:55 | 200 |         6m25s |       127.0.0.1 | POST     "/api/pull"
+Feb 01 15:45:08 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:08 | 200 |      15.863µs |       127.0.0.1 | HEAD     "/"
+Feb 01 15:45:08 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:08 | 200 |    8.245202ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.038-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.038-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=28 layers.offload=26 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.6 GiB" memory.required.partial="9.1 GiB" memory.required.kv="540.0 MiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.5 GiB" memory.weights.repeating="8.4 GiB" memory.weights.nonrepeating="164.1 MiB" memory.graph.full="212.0 MiB" memory.graph.partial="376.1 MiB"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 26 --parallel 1 --port 33197"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.040-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] build info | build=0 commit="unknown" tid="140044897447936" timestamp=1738453509
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140044897447936" timestamp=1738453509 total_threads=16
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33197" tid="140044897447936" timestamp=1738453509
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: loaded meta data with 38 key-value pairs and 377 tensors from /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 (version GGUF V3 (latest))
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   2:                      deepseek2.block_count u32              = 27
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   3:                   deepseek2.context_length u32              = 163840
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   4:                 deepseek2.embedding_length u32              = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   5:              deepseek2.feed_forward_length u32              = 10944
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   6:             deepseek2.attention.head_count u32              = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   7:          deepseek2.attention.head_count_kv u32              = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   8:                   deepseek2.rope.freq_base f32              = 10000.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   9: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  10:                deepseek2.expert_used_count u32              = 6
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  12:        deepseek2.leading_dense_block_count u32              = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  13:                       deepseek2.vocab_size u32              = 102400
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  14:           deepseek2.attention.kv_lora_rank u32              = 512
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  15:             deepseek2.attention.key_length u32              = 192
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  16:           deepseek2.attention.value_length u32              = 128
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  17:       deepseek2.expert_feed_forward_length u32              = 1408
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  18:                     deepseek2.expert_count u32              = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  19:              deepseek2.expert_shared_count u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  20:             deepseek2.expert_weights_scale f32              = 1.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  21:             deepseek2.rope.dimension_count u32              = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  22:                deepseek2.rope.scaling.type str              = yarn
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  23:              deepseek2.rope.scaling.factor f32              = 40.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  24: deepseek2.rope.scaling.original_context_length u32              = 4096
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  25: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.070700
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  26:                       tokenizer.ggml.model str              = gpt2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  27:                         tokenizer.ggml.pre str              = deepseek-llm
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  28:                      tokenizer.ggml.tokens arr[str,102400]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  29:                  tokenizer.ggml.token_type arr[i32,102400]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  30:                      tokenizer.ggml.merges arr[str,99757]   = ["Ġ Ġ", "Ġ t", "Ġ a", "i n", "h e...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 100000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  32:                tokenizer.ggml.eos_token_id u32              = 100001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  33:            tokenizer.ggml.padding_token_id u32              = 100001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  34:               tokenizer.ggml.add_bos_token bool             = true
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  35:               tokenizer.ggml.add_eos_token bool             = false
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  36:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  37:               general.quantization_version u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type  f32:  108 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type q4_0:  268 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 2400
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.6661 MB
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: arch             = deepseek2
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 102400
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 99757
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 163840
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 27
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_head           = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 192
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 3072
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 10944
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 6
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = yarn
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 10000.0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 0.025
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 4096
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model type       = 16B
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model params     = 15.71 B
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model size       = 8.29 GiB (4.53 BPW)
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: general.name     = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 100000 '<|begin▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: PAD token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: LF token         = 126 'Ä'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_layer_dense_lead   = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_lora_q             = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_lora_kv            = 512
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ff_exp             = 1408
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert_shared      = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: expert_weights_scale = 1.0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope_yarn_log_mul    = 0.0707
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 15:45:09 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.32 MiB
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.485-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: offloading 26 repeating layers to GPU
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: offloaded 26/28 layers to GPU
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =  2222.30 MiB
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  8168.73 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 10000.0
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 0.025
+Feb 01 15:45:10 launchpad ollama[1542]: llama_kv_cache_init:  CUDA_Host KV buffer size =    20.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =   520.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: KV self size  =  540.00 MiB, K (f16):  324.00 MiB, V (f16):  216.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.40 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   376.06 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1924
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 16
+Feb 01 15:45:10 launchpad ollama[382206]: INFO [main] model loaded | tid="140044897447936" timestamp=1738453510
+Feb 01 15:45:10 launchpad ollama[1542]: time=2025-02-01T15:45:10.740-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.70 seconds"
+Feb 01 15:45:10 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:10 | 200 |  1.868792545s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 15:45:42 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:45:43 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:43 | 200 |  1.492885919s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:47:11 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:47:15 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:47:15 | 200 |  4.354633179s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:49:58 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:50:04 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:50:04 | 200 |  6.751007515s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:51:31 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:51:42 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:51:42 | 200 | 11.108623634s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:54:26 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:54:38 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:54:38 | 200 | 12.249143915s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:58:10 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:58:18 launchpad ollama[1542]: /build/source/llm/llama.cpp/src/llama.cpp:16940: Deepseek2 does not support K-shift
+Feb 01 15:58:21 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:58:21 | 200 | 11.903695652s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:03:26 launchpad ollama[1542]: time=2025-02-01T16:03:26.639-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.138885157 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:03:26 launchpad ollama[1542]: time=2025-02-01T16:03:26.890-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.389402549 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:03:27 launchpad ollama[1542]: time=2025-02-01T16:03:27.140-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.639748416 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:32:33 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:33 | 200 |      15.859µs |       127.0.0.1 | HEAD     "/"
+Feb 01 16:32:33 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:33 | 200 |    8.054865ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.893-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.893-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=28 layers.offload=26 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.6 GiB" memory.required.partial="9.1 GiB" memory.required.kv="540.0 MiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.5 GiB" memory.weights.repeating="8.4 GiB" memory.weights.nonrepeating="164.1 MiB" memory.graph.full="212.0 MiB" memory.graph.partial="376.1 MiB"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 26 --parallel 1 --port 44777"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] build info | build=0 commit="unknown" tid="140476942221312" timestamp=1738456353
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476942221312" timestamp=1738456353 total_threads=16
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44777" tid="140476942221312" timestamp=1738456353
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: loaded meta data with 38 key-value pairs and 377 tensors from /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 (version GGUF V3 (latest))
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   2:                      deepseek2.block_count u32              = 27
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   3:                   deepseek2.context_length u32              = 163840
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   4:                 deepseek2.embedding_length u32              = 2048
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   5:              deepseek2.feed_forward_length u32              = 10944
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   6:             deepseek2.attention.head_count u32              = 16
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   7:          deepseek2.attention.head_count_kv u32              = 16
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   8:                   deepseek2.rope.freq_base f32              = 10000.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   9: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  10:                deepseek2.expert_used_count u32              = 6
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  12:        deepseek2.leading_dense_block_count u32              = 1
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  13:                       deepseek2.vocab_size u32              = 102400
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  14:           deepseek2.attention.kv_lora_rank u32              = 512
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  15:             deepseek2.attention.key_length u32              = 192
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  16:           deepseek2.attention.value_length u32              = 128
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  17:       deepseek2.expert_feed_forward_length u32              = 1408
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  18:                     deepseek2.expert_count u32              = 64
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  19:              deepseek2.expert_shared_count u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  20:             deepseek2.expert_weights_scale f32              = 1.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  21:             deepseek2.rope.dimension_count u32              = 64
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  22:                deepseek2.rope.scaling.type str              = yarn
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  23:              deepseek2.rope.scaling.factor f32              = 40.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  24: deepseek2.rope.scaling.original_context_length u32              = 4096
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  25: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.070700
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  26:                       tokenizer.ggml.model str              = gpt2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  27:                         tokenizer.ggml.pre str              = deepseek-llm
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  28:                      tokenizer.ggml.tokens arr[str,102400]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  29:                  tokenizer.ggml.token_type arr[i32,102400]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  30:                      tokenizer.ggml.merges arr[str,99757]   = ["Ġ Ġ", "Ġ t", "Ġ a", "i n", "h e...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 100000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  32:                tokenizer.ggml.eos_token_id u32              = 100001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  33:            tokenizer.ggml.padding_token_id u32              = 100001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  34:               tokenizer.ggml.add_bos_token bool             = true
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  35:               tokenizer.ggml.add_eos_token bool             = false
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  36:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  37:               general.quantization_version u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type  f32:  108 tensors
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type q4_0:  268 tensors
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 2400
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.6661 MB
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: arch             = deepseek2
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 102400
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 99757
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 163840
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 2048
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 27
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_head           = 16
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 16
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 64
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 192
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 3072
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 2048
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 10944
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 64
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 6
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = yarn
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 10000.0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 0.025
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 4096
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model type       = 16B
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model params     = 15.71 B
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model size       = 8.29 GiB (4.53 BPW)
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: general.name     = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 100000 '<|begin▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: PAD token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: LF token         = 126 'Ä'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_layer_dense_lead   = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_lora_q             = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_lora_kv            = 512
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ff_exp             = 1408
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert_shared      = 2
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: expert_weights_scale = 1.0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope_yarn_log_mul    = 0.0707
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 16:32:34 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.32 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: time=2025-02-01T16:32:34.346-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server not responding"
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: offloading 26 repeating layers to GPU
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: offloaded 26/28 layers to GPU
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =  2222.30 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  8168.73 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: time=2025-02-01T16:32:34.598-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 10000.0
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 0.025
+Feb 01 16:32:35 launchpad ollama[1542]: llama_kv_cache_init:  CUDA_Host KV buffer size =    20.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =   520.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: KV self size  =  540.00 MiB, K (f16):  324.00 MiB, V (f16):  216.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.40 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   376.06 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1924
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 16
+Feb 01 16:32:35 launchpad ollama[420139]: INFO [main] model loaded | tid="140476942221312" timestamp=1738456355
+Feb 01 16:32:35 launchpad ollama[1542]: time=2025-02-01T16:32:35.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.71 seconds"
+Feb 01 16:32:35 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:35 | 200 |  1.876938441s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 16:32:50 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:33:00 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:33:00 | 200 | 10.085431345s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:37:37 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:37:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:37:48 | 200 | 11.100581045s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:38:42 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:38:54 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:38:54 | 200 | 11.827158041s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:41:14 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:41:20 launchpad ollama[1542]: /build/source/llm/llama.cpp/src/llama.cpp:16940: Deepseek2 does not support K-shift
+Feb 01 16:41:23 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:41:23 | 200 |  9.357358628s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:46:28 launchpad ollama[1542]: time=2025-02-01T16:46:28.886-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.13673044 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:46:29 launchpad ollama[1542]: time=2025-02-01T16:46:29.135-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.385882612 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:46:29 launchpad ollama[1542]: time=2025-02-01T16:46:29.385-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.63581397 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 04 17:44:07 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:07 | 200 |      17.932µs |       127.0.0.1 | HEAD     "/"
+Feb 04 17:44:07 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:07 | 200 |      50.919µs |       127.0.0.1 | GET      "/api/ps"
+Feb 04 17:44:19 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:19 | 200 |      15.157µs |       127.0.0.1 | HEAD     "/"
+Feb 04 17:44:19 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:19 | 200 |    2.326055ms |       127.0.0.1 | GET      "/api/tags"
+Feb 07 12:14:12 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 07 12:14:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 07 12:14:12 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 07 12:14:12 launchpad systemd[1]: ollama.service: Consumed 10min 5.028s CPU time, 21G memory peak, 11.5G read from disk, 8.7G written to disk, 8.4G incoming IP traffic, 153.8M outgoing IP traffic.
+-- Boot 318701f7efd84065b4710ac959eb8901 --
+Feb 07 12:14:47 launchpad systemd[1]: Starting Server for local large language models...
+Feb 07 12:14:47 launchpad systemd[1]: Started Server for local large language models.
+Feb 07 12:14:47 launchpad ollama[1560]: 2025/02/07 12:14:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.526-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.532-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.533-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.535-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2560886655/runners
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.442-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.657-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 12 15:30:56 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:30:56 | 200 |     809.745µs |       127.0.0.1 | HEAD     "/"
+Feb 12 15:30:56 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:30:56 | 200 |   16.704582ms |       127.0.0.1 | POST     "/api/show"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8824487936 required="6.2 GiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44055"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] build info | build=0 commit="unknown" tid="140249299701760" timestamp=1739403057
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140249299701760" timestamp=1739403057 total_threads=16
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44055" tid="140249299701760" timestamp=1739403057
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:30:57 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:31:03 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:31:03 launchpad ollama[317666]: INFO [main] model loaded | tid="140249299701760" timestamp=1739403063
+Feb 12 15:31:03 launchpad ollama[1560]: time=2025-02-12T15:31:03.388-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Feb 12 15:31:03 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:31:03 | 200 |   6.45295186s |       127.0.0.1 | POST     "/api/generate"
+Feb 12 15:31:10 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:31:10 | 200 |  806.499181ms |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:34:37 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:34:37 | 200 |  7.558818056s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:38:54 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:38:54 | 200 |   4.47735231s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8826585088 required="6.2 GiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43755"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] build info | build=0 commit="unknown" tid="140564619010048" timestamp=1739403960
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140564619010048" timestamp=1739403960 total_threads=16
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43755" tid="140564619010048" timestamp=1739403960
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.795-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:46:00 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:46:01 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:46:01 launchpad ollama[319731]: INFO [main] model loaded | tid="140564619010048" timestamp=1739403961
+Feb 12 15:46:01 launchpad ollama[1560]: time=2025-02-12T15:46:01.548-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 15:46:11 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:46:11 | 200 | 11.600758256s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8836022272 required="6.2 GiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46595"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] build info | build=0 commit="unknown" tid="140409782231040" timestamp=1739404619
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140409782231040" timestamp=1739404619 total_threads=16
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46595" tid="140409782231040" timestamp=1739404619
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.330-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:56:59 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:57:00 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:57:00 launchpad ollama[321250]: INFO [main] model loaded | tid="140409782231040" timestamp=1739404620
+Feb 12 15:57:00 launchpad ollama[1560]: time=2025-02-12T15:57:00.083-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 15:57:13 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:57:13 | 200 | 14.578612736s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8836022272 required="6.2 GiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42613"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.629-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] build info | build=0 commit="unknown" tid="139939873157120" timestamp=1739405236
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139939873157120" timestamp=1739405236 total_threads=16
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42613" tid="139939873157120" timestamp=1739405236
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.879-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:07:16 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:07:17 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:07:17 launchpad ollama[322706]: INFO [main] model loaded | tid="139939873157120" timestamp=1739405237
+Feb 12 16:07:17 launchpad ollama[1560]: time=2025-02-12T16:07:17.632-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:07:27 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:07:27 | 200 | 10.713501567s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789884928 required="6.2 GiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33111"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.814-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] build info | build=0 commit="unknown" tid="140372847271936" timestamp=1739405814
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140372847271936" timestamp=1739405814 total_threads=16
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33111" tid="140372847271936" timestamp=1739405814
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:16:55 launchpad ollama[1560]: time=2025-02-12T16:16:55.065-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:16:55 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:16:55 launchpad ollama[324056]: INFO [main] model loaded | tid="140372847271936" timestamp=1739405815
+Feb 12 16:16:55 launchpad ollama[1560]: time=2025-02-12T16:16:55.817-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:17:05 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:17:05 | 200 |  10.89453299s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:19:35 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:19:35 | 200 | 10.435771224s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.045-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789884928 required="6.2 GiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.045-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.046-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44181"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] build info | build=0 commit="unknown" tid="139989269700608" timestamp=1739406644
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139989269700608" timestamp=1739406644 total_threads=16
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44181" tid="139989269700608" timestamp=1739406644
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.298-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:30:44 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:30:45 launchpad ollama[325948]: INFO [main] model loaded | tid="139989269700608" timestamp=1739406645
+Feb 12 16:30:45 launchpad ollama[1560]: time=2025-02-12T16:30:45.051-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:30:53 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:30:53 | 200 |  9.370615161s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:34:50 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:34:50 | 200 | 10.811598588s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.246-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8661434368 required="6.2 GiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.246-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.247-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45783"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] build info | build=0 commit="unknown" tid="139917100216320" timestamp=1739407987
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139917100216320" timestamp=1739407987 total_threads=16
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45783" tid="139917100216320" timestamp=1739407987
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:53:07 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:53:08 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:53:08 launchpad ollama[329337]: INFO [main] model loaded | tid="139917100216320" timestamp=1739407988
+Feb 12 16:53:08 launchpad ollama[1560]: time=2025-02-12T16:53:08.251-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:53:15 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:53:15 | 200 |  8.481043146s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:56:44 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:56:44 | 200 |  7.666530946s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8619229184 required="6.2 GiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.155-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40083"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] build info | build=0 commit="unknown" tid="140119408218112" timestamp=1739408899
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140119408218112" timestamp=1739408899 total_threads=16
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40083" tid="140119408218112" timestamp=1739408899
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.407-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 17:08:19 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 17:08:20 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 17:08:20 launchpad ollama[331594]: INFO [main] model loaded | tid="140119408218112" timestamp=1739408900
+Feb 12 17:08:20 launchpad ollama[1560]: time=2025-02-12T17:08:20.160-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 17:08:27 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:08:27 | 200 |  8.778505461s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:11:34 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:11:34 | 200 |  8.016645904s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.166-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8814526464 required="6.2 GiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.166-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.167-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.167-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43207"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] build info | build=0 commit="unknown" tid="140570797301760" timestamp=1739409519
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140570797301760" timestamp=1739409519 total_threads=16
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43207" tid="140570797301760" timestamp=1739409519
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.419-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 17:18:39 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 17:18:40 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 17:18:40 launchpad ollama[333230]: INFO [main] model loaded | tid="140570797301760" timestamp=1739409520
+Feb 12 17:18:40 launchpad ollama[1560]: time=2025-02-12T17:18:40.172-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 17:18:45 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:18:45 | 200 |  6.309499227s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:20:51 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:20:51 | 200 |  2.663260947s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:23:00 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:23:00 | 200 |  4.274803171s |       127.0.0.1 | POST     "/api/chat"
+-- Boot 1bc40fd30e944d92a892538849018274 --
+Feb 13 08:19:27 launchpad systemd[1]: Starting Server for local large language models...
+Feb 13 08:19:27 launchpad systemd[1]: Started Server for local large language models.
+Feb 13 08:19:27 launchpad ollama[1566]: 2025/02/13 08:19:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.325-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.331-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.333-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.333-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama245345955/runners
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.275-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.489-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.1 GiB"
+Feb 15 11:52:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:37 | 200 |     760.424µs |       127.0.0.1 | HEAD     "/"
+Feb 15 11:52:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:37 | 200 |    7.228879ms |       127.0.0.1 | POST     "/api/show"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.666-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.667-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35099"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] build info | build=0 commit="unknown" tid="140665951301632" timestamp=1739649157
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140665951301632" timestamp=1739649157 total_threads=16
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35099" tid="140665951301632" timestamp=1739649157
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 11:52:37 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.919-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 11:52:46 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 11:52:46 launchpad ollama[174071]: INFO [main] model loaded | tid="140665951301632" timestamp=1739649166
+Feb 15 11:52:46 launchpad ollama[1566]: time=2025-02-15T11:52:46.442-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.77 seconds"
+Feb 15 11:52:46 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:46 | 200 |  8.916506088s |       127.0.0.1 | POST     "/api/generate"
+Feb 15 11:52:54 launchpad ollama[1566]: time=2025-02-15T11:52:54.584-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:52:56 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:56 | 200 |  1.995319181s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 11:53:41 launchpad ollama[1566]: time=2025-02-15T11:53:41.925-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:53:52 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:53:52 | 200 | 10.953237822s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 11:58:46 launchpad ollama[1566]: time=2025-02-15T11:58:46.009-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:59:16 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:59:16 | 200 | 30.673746077s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:01:31 launchpad ollama[1566]: time=2025-02-15T12:01:31.324-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:01:45 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:01:45 | 200 | 14.095009496s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.284-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.441-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.442-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35217"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] build info | build=0 commit="unknown" tid="139763539496960" timestamp=1739650289
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139763539496960" timestamp=1739650289 total_threads=16
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35217" tid="139763539496960" timestamp=1739650289
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 12:11:29 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.733-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 12:11:30 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 12:11:30 launchpad ollama[195769]: INFO [main] model loaded | tid="139763539496960" timestamp=1739650290
+Feb 15 12:11:30 launchpad ollama[1566]: time=2025-02-15T12:11:30.736-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Feb 15 12:11:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:11:37 | 200 |  7.908151505s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:16:08 launchpad ollama[1566]: time=2025-02-15T12:16:08.876-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:16:30 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:16:30 | 200 | 21.938492665s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.070-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.228-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.228-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46093"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] build info | build=0 commit="unknown" tid="140125316128768" timestamp=1739653125
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140125316128768" timestamp=1739653125 total_threads=16
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46093" tid="140125316128768" timestamp=1739653125
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 12:58:45 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.524-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 12:58:46 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 12:58:46 launchpad ollama[210632]: INFO [main] model loaded | tid="140125316128768" timestamp=1739653126
+Feb 15 12:58:46 launchpad ollama[1566]: time=2025-02-15T12:58:46.527-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Feb 15 12:59:40 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:59:40 | 200 | 55.188965858s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 13:01:42 launchpad ollama[1566]: time=2025-02-15T13:01:42.169-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 13:03:25 launchpad ollama[1566]: [GIN] 2025/02/15 - 13:03:25 | 200 |         1m43s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 13:53:02 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 15 13:53:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 15 13:53:02 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 15 13:53:02 launchpad systemd[1]: ollama.service: Consumed 12min 8.868s CPU time, 8G memory peak, 7.1G read from disk, 508.1M written to disk, 1.8M incoming IP traffic, 2.5M outgoing IP traffic.
+-- Boot e896c194ceef425f8e564ca391b26ad0 --
+Feb 15 13:53:08 launchpad systemd[1]: Starting Server for local large language models...
+Feb 15 13:53:08 launchpad systemd[1]: Started Server for local large language models.
+Feb 15 13:53:08 launchpad ollama[1533]: 2025/02/15 13:53:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.945-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.950-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.953-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.954-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama382441005/runners
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.943-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:12 launchpad ollama[1533]: time=2025-02-15T13:53:12.141-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 15 18:09:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 15 18:09:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 15 18:09:05 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 15 18:09:05 launchpad systemd[1]: ollama.service: Consumed 3.408s CPU time, 787.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot b75c99985a0841129b7af6edd7fb4133 --
+Feb 16 04:02:17 launchpad systemd[1]: Starting Server for local large language models...
+Feb 16 04:02:17 launchpad systemd[1]: Started Server for local large language models.
+Feb 16 04:02:17 launchpad ollama[1540]: 2025/02/16 04:02:17 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.372-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.376-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.378-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.380-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1089523640/runners
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.273-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.509-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:27 | 200 |     804.242µs |       127.0.0.1 | HEAD     "/"
+Feb 17 13:36:27 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:27 | 200 |   16.897737ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.971-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9833349120 required="6.2 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.971-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.972-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34405"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] build info | build=0 commit="unknown" tid="140422218096640" timestamp=1739828188
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140422218096640" timestamp=1739828188 total_threads=16
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34405" tid="140422218096640" timestamp=1739828188
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: time=2025-02-17T13:36:28.225-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 13:36:28 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 13:36:33 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 13:36:34 launchpad ollama[99439]: INFO [main] model loaded | tid="140422218096640" timestamp=1739828194
+Feb 17 13:36:34 launchpad ollama[1540]: time=2025-02-17T13:36:34.244-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Feb 17 13:36:34 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:34 | 200 |  6.447249478s |       127.0.0.1 | POST     "/api/generate"
+Feb 17 13:38:07 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:38:07 | 200 | 10.646925045s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:41:18 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:41:18 | 200 |  6.021633368s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.454-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.454-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.455-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37795"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] build info | build=0 commit="unknown" tid="140633737093120" timestamp=1739829048
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140633737093120" timestamp=1739829048 total_threads=16
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37795" tid="140633737093120" timestamp=1739829048
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.707-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 13:50:48 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 13:50:49 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 13:50:49 launchpad ollama[101447]: INFO [main] model loaded | tid="140633737093120" timestamp=1739829049
+Feb 17 13:50:49 launchpad ollama[1540]: time=2025-02-17T13:50:49.460-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 13:50:55 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:50:55 | 200 |  7.472184967s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:52:48 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:52:48 | 200 |  5.467955799s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34253"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] build info | build=0 commit="unknown" tid="139969446260736" timestamp=1739829670
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139969446260736" timestamp=1739829670 total_threads=16
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34253" tid="139969446260736" timestamp=1739829670
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.410-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:01:10 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:01:11 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:01:11 launchpad ollama[102929]: INFO [main] model loaded | tid="139969446260736" timestamp=1739829671
+Feb 17 14:01:11 launchpad ollama[1540]: time=2025-02-17T14:01:11.163-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:01:17 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:01:17 | 200 |  7.928378858s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:04:37 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:04:37 | 200 | 15.181517664s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:06:53 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:06:53 | 200 |  6.931962169s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:11:04 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:11:04 | 200 |  6.738335961s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9884205056 required="6.2 GiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40167"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] build info | build=0 commit="unknown" tid="139986082029568" timestamp=1739830605
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139986082029568" timestamp=1739830605 total_threads=16
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40167" tid="139986082029568" timestamp=1739830605
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.888-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:16:45 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:16:46 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:16:46 launchpad ollama[105090]: INFO [main] model loaded | tid="139986082029568" timestamp=1739830606
+Feb 17 14:16:46 launchpad ollama[1540]: time=2025-02-17T14:16:46.642-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:16:54 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:16:54 | 200 |  8.560344084s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:17:53 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:17:53 | 200 |  5.860552603s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.586-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9884205056 required="6.2 GiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.586-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.587-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39049"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] build info | build=0 commit="unknown" tid="140273477804032" timestamp=1739831090
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140273477804032" timestamp=1739831090 total_threads=16
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39049" tid="140273477804032" timestamp=1739831090
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.839-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:24:50 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:24:51 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:24:51 launchpad ollama[106236]: INFO [main] model loaded | tid="140273477804032" timestamp=1739831091
+Feb 17 14:24:51 launchpad ollama[1540]: time=2025-02-17T14:24:51.592-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:25:01 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:25:01 | 200 | 11.294123511s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:26:56 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:26:56 | 200 | 10.134133371s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44533"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] build info | build=0 commit="unknown" tid="140186630414336" timestamp=1739831745
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140186630414336" timestamp=1739831745 total_threads=16
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44533" tid="140186630414336" timestamp=1739831745
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.923-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:35:45 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:35:46 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:35:46 launchpad ollama[107804]: INFO [main] model loaded | tid="140186630414336" timestamp=1739831746
+Feb 17 14:35:46 launchpad ollama[1540]: time=2025-02-17T14:35:46.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:35:51 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:35:51 | 200 |   6.32260039s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:36:31 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:36:31 | 200 |    11.97683ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:40:48 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:40:48 | 200 |   4.33500691s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:41:35 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:41:35 | 200 |   12.500274ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:41:43 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:41:43 | 200 |   12.221463ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:43:19 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:43:19 | 200 |  4.854734596s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:45:35 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:45:35 | 200 |   6.46467151s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:47:03 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:47:03 | 200 |  5.245958268s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9883090944 required="6.2 GiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.432-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37441"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] build info | build=0 commit="unknown" tid="140268142682112" timestamp=1739833096
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140268142682112" timestamp=1739833096 total_threads=16
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37441" tid="140268142682112" timestamp=1739833096
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.684-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:58:16 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:58:17 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:58:17 launchpad ollama[110943]: INFO [main] model loaded | tid="140268142682112" timestamp=1739833097
+Feb 17 14:58:17 launchpad ollama[1540]: time=2025-02-17T14:58:17.436-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:58:24 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:58:24 | 200 |   8.33625331s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:00:03 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:00:03 | 200 |  6.886185006s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:03:25 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:03:25 | 200 |  8.601641964s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.116-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38177"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.116-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.117-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.117-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] build info | build=0 commit="unknown" tid="140038500421632" timestamp=1739833925
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140038500421632" timestamp=1739833925 total_threads=16
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38177" tid="140038500421632" timestamp=1739833925
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.368-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 15:12:05 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 15:12:06 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 15:12:06 launchpad ollama[112866]: INFO [main] model loaded | tid="140038500421632" timestamp=1739833926
+Feb 17 15:12:06 launchpad ollama[1540]: time=2025-02-17T15:12:06.121-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 15:12:15 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:12:15 | 200 |  10.59340036s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.487-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9883090944 required="6.2 GiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.487-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.488-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41091"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] build info | build=0 commit="unknown" tid="140332373102592" timestamp=1739834438
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140332373102592" timestamp=1739834438 total_threads=16
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41091" tid="140332373102592" timestamp=1739834438
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.740-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 15:20:38 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 15:20:39 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 15:20:39 launchpad ollama[114084]: INFO [main] model loaded | tid="140332373102592" timestamp=1739834439
+Feb 17 15:20:39 launchpad ollama[1540]: time=2025-02-17T15:20:39.492-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 15:20:47 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:20:47 | 200 |  9.225234909s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:23:08 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:23:08 | 200 | 10.996744755s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:23:40 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:23:40 | 200 |  8.134205118s |       127.0.0.1 | POST     "/api/chat"
+Feb 21 16:18:22 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 21 16:18:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 21 16:18:23 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 21 16:18:23 launchpad systemd[1]: ollama.service: Consumed 3min 14.009s CPU time, 5.6G memory peak, 4.6G read from disk, 508.1M written to disk, 4.9M incoming IP traffic, 5.5M outgoing IP traffic.
+-- Boot 56a21e2208e1447e869457b88292f8e2 --
+Feb 21 16:19:16 launchpad systemd[1]: Starting Server for local large language models...
+Feb 21 16:19:16 launchpad systemd[1]: Started Server for local large language models.
+Feb 21 16:19:16 launchpad ollama[1539]: 2025/02/21 16:19:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.227-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.232-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.235-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.236-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1648657965/runners
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.116-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:27 launchpad ollama[1539]: time=2025-02-21T16:19:27.054-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 21 16:20:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 21 16:20:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 21 16:20:06 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 21 16:20:06 launchpad systemd[1]: ollama.service: Consumed 11.213s CPU time, 786.5M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot e12f595e26c445d49248c90c7f337992 --
+Feb 21 16:20:41 launchpad systemd[1]: Starting Server for local large language models...
+Feb 21 16:20:41 launchpad systemd[1]: Started Server for local large language models.
+Feb 21 16:20:41 launchpad ollama[1510]: 2025/02/21 16:20:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.265-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.269-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.271-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.272-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama141469161/runners
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.178-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.179-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.179-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.180-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.180-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.400-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:10:36 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:10:36 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:10:36 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:10:36 launchpad systemd[1]: ollama.service: Consumed 3.725s CPU time, 786.4M memory peak, 234.3M read from disk, 508.1M written to disk.
+Feb 22 15:10:42 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:10:42 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:10:42 launchpad ollama[609219]: 2025/02/22 15:10:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.546-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.546-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.547-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.547-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1820237840/runners
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.905-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="9.0 GiB"
+Feb 22 15:29:56 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:29:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:29:56 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:29:56 launchpad systemd[1]: ollama.service: Consumed 3.608s CPU time, 552.2M memory peak, 508.1M written to disk.
+-- Boot 2661aaee13a741189bfa7b24c1e81223 --
+Feb 22 15:30:33 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:30:33 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:30:33 launchpad ollama[1534]: 2025/02/22 15:30:33 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.266-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.271-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.272-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.275-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3061524593/runners
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.441-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:35:33 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:35:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:35:33 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:35:33 launchpad systemd[1]: ollama.service: Consumed 3.404s CPU time, 787M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot bc1d123a60ee4d9eb1178bf3702c8b05 --
+Feb 22 15:36:05 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:36:05 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:36:06 launchpad ollama[1534]: 2025/02/22 15:36:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.054-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.059-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.060-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.061-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama504777951/runners
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.044-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.224-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:37:38 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:37:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:37:38 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:37:38 launchpad systemd[1]: ollama.service: Consumed 3.398s CPU time, 788.9M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 2e14837fa2344ee885c7a1f0d3779a60 --
+Feb 22 15:38:16 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:38:16 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:38:16 launchpad ollama[1531]: 2025/02/22 15:38:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.410-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.415-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.416-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.418-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1154028157/runners
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.302-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:21 launchpad ollama[1531]: time=2025-02-22T15:38:21.002-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 22 15:38:39 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:38:39 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:38:39 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:38:39 launchpad systemd[1]: ollama.service: Consumed 5.157s CPU time, 787.5M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot 40e81e4e58c840da9f86fa8bdb940f2e --
+Feb 22 15:39:12 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:39:12 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:39:12 launchpad ollama[1529]: 2025/02/22 15:39:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.201-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.206-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.206-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.208-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1018485095/runners
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.159-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.160-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.160-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.368-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Feb 22 17:16:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 17:16:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 17:16:05 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 17:16:05 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 787.6M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot f7192e1dfabf4cf78ceb4665c68773dc --
+Feb 22 17:16:36 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 17:16:37 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 17:16:37 launchpad ollama[1531]: 2025/02/22 17:16:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.096-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.101-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.102-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.103-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama779339256/runners
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.071-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.071-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.277-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 22 17:30:21 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 17:30:21 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 17:30:21 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 17:30:21 launchpad systemd[1]: ollama.service: Consumed 3.403s CPU time, 788.5M memory peak, 234.8M read from disk, 508.1M written to disk.
+-- Boot a2e049ff56334ceebf2a3a5d90b82b24 --
+Feb 22 17:30:52 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 17:30:52 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 17:30:52 launchpad ollama[1537]: 2025/02/22 17:30:52 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.849-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.854-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.855-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.855-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3828889336/runners
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:56 launchpad ollama[1537]: time=2025-02-22T17:30:56.029-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 23 09:26:59 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 23 09:26:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 23 09:26:59 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 23 09:26:59 launchpad systemd[1]: ollama.service: Consumed 3.455s CPU time, 788.4M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 1e58dfadc1614a5e95da04bf186aedd0 --
+Feb 23 09:27:18 launchpad systemd[1]: Starting Server for local large language models...
+Feb 23 09:27:18 launchpad systemd[1]: Started Server for local large language models.
+Feb 23 09:27:18 launchpad ollama[1566]: 2025/02/23 09:27:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.760-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.767-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.768-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.769-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama121305121/runners
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.611-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.611-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:07:02 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: State 'stop-sigterm' timed out. Killing.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1566 (.ollama-wrapped) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1623 (.ollama-wrapped) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1624 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1625 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1627 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Main process exited, code=killed, status=9/KILL
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Failed with result 'timeout'.
+Feb 23 12:08:32 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Consumed 4.704s CPU time, 771.6M memory peak, 232.9M read from disk, 508.1M written to disk.
+-- Boot cea875ad09b44a3e90809d58d6431a00 --
+Feb 23 12:09:03 launchpad systemd[1]: Starting Server for local large language models...
+Feb 23 12:09:03 launchpad systemd[1]: Started Server for local large language models.
+Feb 23 12:09:03 launchpad ollama[1533]: 2025/02/23 12:09:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.172-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.177-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.178-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.180-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1424745992/runners
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.164-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.354-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:22:46 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:22:46 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:22:46 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:22:46 launchpad systemd[1]: ollama.service: Consumed 4.400s CPU time, 787.7M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot f2764ae1480f433eb1ba957d62a7667b --
+Feb 26 12:23:26 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:23:26 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:23:26 launchpad ollama[1532]: 2025/02/26 12:23:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.480-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.484-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.485-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.488-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2149963496/runners
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.326-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.326-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:37 launchpad ollama[1532]: time=2025-02-26T12:23:37.324-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 12:23:58 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:23:58 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:23:58 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:23:58 launchpad systemd[1]: ollama.service: Consumed 11.151s CPU time, 787.4M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 70b89230457f412db030b8af1136a7c8 --
+Feb 26 12:24:29 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:24:29 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:24:29 launchpad ollama[1522]: 2025/02/26 12:24:29 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.340-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.345-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.346-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.350-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3150348286/runners
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.503-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:32:47 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:32:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:32:47 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:32:47 launchpad systemd[1]: ollama.service: Consumed 3.386s CPU time, 787.8M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 5cc17d87b335478baf55194da99ed320 --
+Feb 26 12:33:18 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:33:18 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:33:18 launchpad ollama[1523]: 2025/02/26 12:33:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.901-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.908-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.909-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.911-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3865012489/runners
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.831-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:22 launchpad ollama[1523]: time=2025-02-26T12:33:22.056-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:36:51 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:36:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:36:52 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:36:52 launchpad systemd[1]: ollama.service: Consumed 3.354s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 0936d1c99120419cac9a87deaba8ae5c --
+Feb 26 12:37:23 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:37:23 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:37:23 launchpad ollama[1524]: 2025/02/26 12:37:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.373-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.377-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.381-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.383-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1493177641/runners
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.363-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.364-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.364-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.365-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.365-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.583-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 15:03:01 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:03:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:03:01 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:03:01 launchpad systemd[1]: ollama.service: Consumed 3.498s CPU time, 787.5M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 97d6655fe25445a8b4cd6aba50ef3cdb --
+Feb 26 15:03:34 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:03:34 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:03:34 launchpad ollama[1578]: 2025/02/26 15:03:34 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.360-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.365-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.366-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.368-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3125874339/runners
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.314-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.314-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.566-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:05:26 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:05:27 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:05:27 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:05:27 launchpad systemd[1]: ollama.service: Consumed 3.373s CPU time, 787.1M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot 15ffaa8cc0d94398a6a4c294dd4bfd4a --
+Feb 26 15:06:23 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:06:23 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:06:23 launchpad ollama[1583]: 2025/02/26 15:06:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.003-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.008-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.009-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.009-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3586662773/runners
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.021-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.021-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.260-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:08:41 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:08:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:08:41 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:08:41 launchpad systemd[1]: ollama.service: Consumed 3.470s CPU time, 787.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot e838850f300640b481e59975fb387ccf --
+Feb 26 15:09:11 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:09:12 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:09:12 launchpad ollama[1579]: 2025/02/26 15:09:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.123-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.127-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.128-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.129-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3477743635/runners
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.064-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.065-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.065-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.066-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.066-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.290-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:11:28 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:11:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:11:29 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:11:29 launchpad systemd[1]: ollama.service: Consumed 3.382s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 1d2da796c9bd4e5ab274940ad14b6e58 --
+Feb 26 15:12:00 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:12:00 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:12:00 launchpad ollama[1574]: 2025/02/26 15:12:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.581-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.586-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.587-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.589-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1532247760/runners
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.599-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.599-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.846-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:18:35 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:18:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:18:35 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:18:35 launchpad systemd[1]: ollama.service: Consumed 3.432s CPU time, 787.3M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 773915b1c3354fba8ededbdbe8db91d4 --
+Feb 26 15:19:07 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:19:07 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:19:08 launchpad ollama[1580]: 2025/02/26 15:19:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.077-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.081-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.082-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.084-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3665229947/runners
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.039-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.040-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.040-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.041-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.041-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.297-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 17:08:19 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:08:20 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:08:20 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:08:20 launchpad systemd[1]: ollama.service: Consumed 3.453s CPU time, 787.6M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot d7af577310c84eb597d71261f9d9c39a --
+Feb 26 17:08:55 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:08:55 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:08:55 launchpad ollama[1574]: 2025/02/26 17:08:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.289-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.294-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.295-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.298-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4184677761/runners
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.270-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.836-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 17:27:49 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:27:49 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:27:49 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:27:49 launchpad systemd[1]: ollama.service: Consumed 3.447s CPU time, 787.3M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 219199171ac04c159dfa8abb490de196 --
+Feb 26 17:28:22 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:28:22 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:28:22 launchpad ollama[1577]: 2025/02/26 17:28:22 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.156-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.164-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.165-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.166-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1559463410/runners
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.084-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.328-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 17:29:35 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:29:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:29:35 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:29:35 launchpad systemd[1]: ollama.service: Consumed 3.328s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 288488cd00584e369ea93802998b6fa9 --
+Feb 26 17:30:07 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:30:07 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:30:07 launchpad ollama[1577]: 2025/02/26 17:30:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.318-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.324-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.326-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.328-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama85053230/runners
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.265-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.266-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.267-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.267-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.553-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:12 | 200 |    1.209007ms |       127.0.0.1 | HEAD     "/"
+Feb 28 14:02:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:12 | 200 |   16.047359ms |       127.0.0.1 | POST     "/api/show"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.814-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9214951424 required="6.2 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.814-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.815-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34661"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] build info | build=0 commit="unknown" tid="140506088456192" timestamp=1740780133
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140506088456192" timestamp=1740780133 total_threads=16
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34661" tid="140506088456192" timestamp=1740780133
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:02:13 launchpad ollama[1577]: time=2025-02-28T14:02:13.068-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:02:13 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:02:19 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:02:19 launchpad ollama[114394]: INFO [main] model loaded | tid="140506088456192" timestamp=1740780139
+Feb 28 14:02:19 launchpad ollama[1577]: time=2025-02-28T14:02:19.346-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.53 seconds"
+Feb 28 14:02:19 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:19 | 200 |   6.70611953s |       127.0.0.1 | POST     "/api/generate"
+Feb 28 14:03:52 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:03:52 | 200 |   9.54814712s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:08:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:08:12 | 200 | 12.130951516s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.728-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217769472 required="6.2 GiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.728-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.729-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44135"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] build info | build=0 commit="unknown" tid="140616155516928" timestamp=1740781760
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140616155516928" timestamp=1740781760 total_threads=16
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44135" tid="140616155516928" timestamp=1740781760
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.981-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:29:21 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:29:21 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:29:21 launchpad ollama[118548]: INFO [main] model loaded | tid="140616155516928" timestamp=1740781761
+Feb 28 14:29:21 launchpad ollama[1577]: time=2025-02-28T14:29:21.734-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:29:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:29:32 | 200 | 11.611788012s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.366-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9223798784 required="6.2 GiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.366-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.367-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35261"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] build info | build=0 commit="unknown" tid="140354746605568" timestamp=1740782199
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140354746605568" timestamp=1740782199 total_threads=16
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35261" tid="140354746605568" timestamp=1740782199
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.619-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:36:39 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:36:40 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:36:40 launchpad ollama[119666]: INFO [main] model loaded | tid="140354746605568" timestamp=1740782200
+Feb 28 14:36:40 launchpad ollama[1577]: time=2025-02-28T14:36:40.372-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:36:47 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:36:47 | 200 |  8.488455937s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:40:21 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:40:21 | 200 | 10.054006302s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.124-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.2 GiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.124-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.125-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34885"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] build info | build=0 commit="unknown" tid="140576655659008" timestamp=1740782840
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140576655659008" timestamp=1740782840 total_threads=16
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34885" tid="140576655659008" timestamp=1740782840
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.377-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:47:20 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:47:21 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:47:21 launchpad ollama[121414]: INFO [main] model loaded | tid="140576655659008" timestamp=1740782841
+Feb 28 14:47:21 launchpad ollama[1577]: time=2025-02-28T14:47:21.130-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:47:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:47:32 | 200 | 12.711746214s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.291-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215803392 required="6.2 GiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.291-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.292-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.292-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39517"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] build info | build=0 commit="unknown" tid="140152201580544" timestamp=1740783697
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140152201580544" timestamp=1740783697 total_threads=16
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39517" tid="140152201580544" timestamp=1740783697
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.544-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:01:37 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:01:38 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:01:38 launchpad ollama[123542]: INFO [main] model loaded | tid="140152201580544" timestamp=1740783698
+Feb 28 15:01:38 launchpad ollama[1577]: time=2025-02-28T15:01:38.298-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:01:48 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:01:48 | 200 | 11.520683682s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9193324544 required="6.2 GiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33669"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] build info | build=0 commit="unknown" tid="140538006142976" timestamp=1740784444
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140538006142976" timestamp=1740784444 total_threads=16
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33669" tid="140538006142976" timestamp=1740784444
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.639-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:14:04 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:14:05 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:14:05 launchpad ollama[125470]: INFO [main] model loaded | tid="140538006142976" timestamp=1740784445
+Feb 28 15:14:05 launchpad ollama[1577]: time=2025-02-28T15:14:05.392-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:14:10 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:14:10 | 200 |  6.596609391s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217507328 required="6.2 GiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39179"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] build info | build=0 commit="unknown" tid="139824834080768" timestamp=1740786382
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139824834080768" timestamp=1740786382 total_threads=16
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39179" tid="139824834080768" timestamp=1740786382
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.576-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:46:22 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:46:23 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:46:23 launchpad ollama[131154]: INFO [main] model loaded | tid="139824834080768" timestamp=1740786383
+Feb 28 15:46:23 launchpad ollama[1577]: time=2025-02-28T15:46:23.329-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:46:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:46:34 | 200 | 12.125250601s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217900544 required="6.2 GiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44653"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] build info | build=0 commit="unknown" tid="139659496579072" timestamp=1740786701
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139659496579072" timestamp=1740786701 total_threads=16
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44653" tid="139659496579072" timestamp=1740786701
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.912-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:51:41 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:51:42 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:51:42 launchpad ollama[131955]: INFO [main] model loaded | tid="139659496579072" timestamp=1740786702
+Feb 28 15:51:42 launchpad ollama[1577]: time=2025-02-28T15:51:42.665-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:51:51 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:51:51 | 200 |  9.682349236s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.708-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218490368 required="6.2 GiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.708-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.709-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45959"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] build info | build=0 commit="unknown" tid="140079823495168" timestamp=1740787305
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140079823495168" timestamp=1740787305 total_threads=16
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45959" tid="140079823495168" timestamp=1740787305
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.961-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:01:45 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:01:46 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:01:46 launchpad ollama[133439]: INFO [main] model loaded | tid="140079823495168" timestamp=1740787306
+Feb 28 16:01:46 launchpad ollama[1577]: time=2025-02-28T16:01:46.714-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:01:53 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:01:53 | 200 |  7.500569739s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:03:31 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:03:31 | 200 |  1.889825884s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:06:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:06:32 | 200 |   8.81532141s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:08:10 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:08:10 | 200 |  9.606859294s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200926720 required="6.2 GiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41925"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] build info | build=0 commit="unknown" tid="140114432872448" timestamp=1740789108
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140114432872448" timestamp=1740789108 total_threads=16
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41925" tid="140114432872448" timestamp=1740789108
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.874-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:31:48 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:31:49 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:31:49 launchpad ollama[140473]: INFO [main] model loaded | tid="140114432872448" timestamp=1740789109
+Feb 28 16:31:49 launchpad ollama[1577]: time=2025-02-28T16:31:49.627-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:31:56 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:31:56 | 200 |  8.361332959s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:33:50 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:33:50 | 200 |  6.532573967s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:36:27 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:36:27 | 200 |  7.005495158s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:38:35 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:38:35 | 200 |  7.225236848s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:40:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:40:39 | 200 |  6.244688232s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9209184256 required="6.2 GiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46827"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] build info | build=0 commit="unknown" tid="139897028616192" timestamp=1740789989
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139897028616192" timestamp=1740789989 total_threads=16
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46827" tid="139897028616192" timestamp=1740789989
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.311-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:46:29 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:46:30 launchpad ollama[142637]: INFO [main] model loaded | tid="139897028616192" timestamp=1740789990
+Feb 28 16:46:30 launchpad ollama[1577]: time=2025-02-28T16:46:30.065-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:46:38 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:46:38 | 200 |  9.983171798s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.277-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9208856576 required="6.2 GiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.277-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.278-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37471"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] build info | build=0 commit="unknown" tid="140661142446080" timestamp=1740790704
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140661142446080" timestamp=1740790704 total_threads=16
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37471" tid="140661142446080" timestamp=1740790704
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.530-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:58:24 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:58:25 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:58:25 launchpad ollama[144397]: INFO [main] model loaded | tid="140661142446080" timestamp=1740790705
+Feb 28 16:58:25 launchpad ollama[1577]: time=2025-02-28T16:58:25.283-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:58:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:58:34 | 200 | 10.082412373s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9368961024 required="6.2 GiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43047"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] build info | build=0 commit="unknown" tid="140694722121728" timestamp=1740791162
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140694722121728" timestamp=1740791162 total_threads=16
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43047" tid="140694722121728" timestamp=1740791162
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.324-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:06:02 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:06:03 launchpad ollama[145542]: INFO [main] model loaded | tid="140694722121728" timestamp=1740791163
+Feb 28 17:06:03 launchpad ollama[1577]: time=2025-02-28T17:06:03.077-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:06:13 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:06:13 | 200 | 11.290778531s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9368764416 required="6.2 GiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34297"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.364-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] build info | build=0 commit="unknown" tid="139994690822144" timestamp=1740792122
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139994690822144" timestamp=1740792122 total_threads=16
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34297" tid="139994690822144" timestamp=1740792122
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.615-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:22:02 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:22:03 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:22:03 launchpad ollama[147869]: INFO [main] model loaded | tid="139994690822144" timestamp=1740792123
+Feb 28 17:22:03 launchpad ollama[1577]: time=2025-02-28T17:22:03.368-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:22:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:22:12 | 200 | 10.311215036s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9369092096 required="6.2 GiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43575"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] build info | build=0 commit="unknown" tid="139748682330112" timestamp=1740792940
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748682330112" timestamp=1740792940 total_threads=16
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43575" tid="139748682330112" timestamp=1740792940
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.476-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:35:40 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:35:41 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:35:41 launchpad ollama[149852]: INFO [main] model loaded | tid="139748682330112" timestamp=1740792941
+Feb 28 17:35:41 launchpad ollama[1577]: time=2025-02-28T17:35:41.229-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:35:51 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:35:51 | 200 | 11.237254254s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158852608 required="6.2 GiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41533"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] build info | build=0 commit="unknown" tid="139945581817856" timestamp=1740793575
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139945581817856" timestamp=1740793575 total_threads=16
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41533" tid="139945581817856" timestamp=1740793575
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.281-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:46:15 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:46:16 launchpad ollama[151489]: INFO [main] model loaded | tid="139945581817856" timestamp=1740793576
+Feb 28 17:46:16 launchpad ollama[1577]: time=2025-02-28T17:46:16.286-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Feb 28 17:46:25 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:46:25 | 200 | 10.663781668s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.765-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9159049216 required="6.2 GiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.766-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.766-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37203"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] build info | build=0 commit="unknown" tid="139748496625664" timestamp=1740794159
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748496625664" timestamp=1740794159 total_threads=16
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37203" tid="139748496625664" timestamp=1740794159
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:56:00 launchpad ollama[1577]: time=2025-02-28T17:56:00.018-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:56:00 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:56:00 launchpad ollama[152915]: INFO [main] model loaded | tid="139748496625664" timestamp=1740794160
+Feb 28 17:56:00 launchpad ollama[1577]: time=2025-02-28T17:56:00.772-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Feb 28 17:56:09 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:56:09 | 200 |  9.701286012s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:58:09 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:58:09 | 200 |  6.213241985s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.920-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158983680 required="6.2 GiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.920-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.921-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38017"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] build info | build=0 commit="unknown" tid="140487825596416" timestamp=1740794591
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140487825596416" timestamp=1740794591 total_threads=16
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38017" tid="140487825596416" timestamp=1740794591
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:03:12 launchpad ollama[1577]: time=2025-02-28T18:03:12.173-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:03:12 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:03:12 launchpad ollama[154015]: INFO [main] model loaded | tid="140487825596416" timestamp=1740794592
+Feb 28 18:03:12 launchpad ollama[1577]: time=2025-02-28T18:03:12.926-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:03:23 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:03:23 | 200 | 11.507349113s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9159311360 required="6.2 GiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42861"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.206-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] build info | build=0 commit="unknown" tid="139802792906752" timestamp=1740794956
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139802792906752" timestamp=1740794956 total_threads=16
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42861" tid="139802792906752" timestamp=1740794956
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.456-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:09:16 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:09:17 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:09:17 launchpad ollama[154906]: INFO [main] model loaded | tid="139802792906752" timestamp=1740794957
+Feb 28 18:09:17 launchpad ollama[1577]: time=2025-02-28T18:09:17.461-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Feb 28 18:09:25 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:09:25 | 200 |  9.757300442s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.434-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158787072 required="6.2 GiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.435-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.435-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38313"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] build info | build=0 commit="unknown" tid="139803478745088" timestamp=1740795567
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139803478745088" timestamp=1740795567 total_threads=16
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38313" tid="139803478745088" timestamp=1740795567
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.687-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:19:27 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:19:28 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:19:28 launchpad ollama[156404]: INFO [main] model loaded | tid="139803478745088" timestamp=1740795568
+Feb 28 18:19:28 launchpad ollama[1577]: time=2025-02-28T18:19:28.440-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:19:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:19:34 | 200 |  7.046852586s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:21:45 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:21:45 | 200 |  4.693537262s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:24:54 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:24:54 | 200 |  8.155017137s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:28:27 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:28:27 | 200 |  7.947639339s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:33:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:33:12 | 200 |  7.435491807s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:34:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:34:39 | 200 |  7.483601103s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:37:23 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:37:23 | 200 | 13.887015441s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:39:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:39:39 | 200 | 12.413931467s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.915-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9192407040 required="6.2 GiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.916-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.916-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36579"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] build info | build=0 commit="unknown" tid="140624644788224" timestamp=1740797140
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624644788224" timestamp=1740797140 total_threads=16
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36579" tid="140624644788224" timestamp=1740797140
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:45:41 launchpad ollama[1577]: time=2025-02-28T18:45:41.168-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:45:41 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:45:41 launchpad ollama[160387]: INFO [main] model loaded | tid="140624644788224" timestamp=1740797141
+Feb 28 18:45:41 launchpad ollama[1577]: time=2025-02-28T18:45:41.920-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:45:53 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:45:53 | 200 | 13.180541336s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:48:16 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:48:16 | 200 |  5.436342371s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:49:33 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:49:33 | 200 | 13.168018795s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:52:19 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:52:19 | 200 |  4.436562112s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.424-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200467968 required="6.2 GiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.424-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.425-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36111"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] build info | build=0 commit="unknown" tid="140502316916736" timestamp=1740856342
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140502316916736" timestamp=1740856342 total_threads=16
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36111" tid="140502316916736" timestamp=1740856342
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.677-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:12:22 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:12:23 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:12:23 launchpad ollama[175662]: INFO [main] model loaded | tid="140502316916736" timestamp=1740856343
+Mar 01 11:12:23 launchpad ollama[1577]: time=2025-03-01T11:12:23.430-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:12:29 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:12:29 | 200 |  6.927777363s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9220128768 required="6.2 GiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.854-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39789"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] build info | build=0 commit="unknown" tid="140010778365952" timestamp=1740856709
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140010778365952" timestamp=1740856709 total_threads=16
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39789" tid="140010778365952" timestamp=1740856709
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:18:30 launchpad ollama[1577]: time=2025-03-01T11:18:30.106-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:18:30 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:18:30 launchpad ollama[176582]: INFO [main] model loaded | tid="140010778365952" timestamp=1740856710
+Mar 01 11:18:30 launchpad ollama[1577]: time=2025-03-01T11:18:30.858-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:18:36 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:18:36 | 200 |  7.292746198s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9204793344 required="6.2 GiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34011"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] build info | build=0 commit="unknown" tid="140036986118144" timestamp=1740857773
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140036986118144" timestamp=1740857773 total_threads=16
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34011" tid="140036986118144" timestamp=1740857773
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.867-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:36:13 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:36:14 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:36:14 launchpad ollama[179702]: INFO [main] model loaded | tid="140036986118144" timestamp=1740857774
+Mar 01 11:36:14 launchpad ollama[1577]: time=2025-03-01T11:36:14.621-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:36:19 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:36:19 | 200 |   6.23055622s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:37:46 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:37:46 | 200 |  4.635591282s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:42:43 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:42:43 | 200 |  5.334304431s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:43:36 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:43:36 | 200 |  3.389103063s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9364897792 required="6.2 GiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.794-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35295"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] build info | build=0 commit="unknown" tid="139639387136000" timestamp=1740858635
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139639387136000" timestamp=1740858635 total_threads=16
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35295" tid="139639387136000" timestamp=1740858635
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:50:36 launchpad ollama[1577]: time=2025-03-01T11:50:36.045-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:50:36 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:50:36 launchpad ollama[182689]: INFO [main] model loaded | tid="139639387136000" timestamp=1740858636
+Mar 01 11:50:36 launchpad ollama[1577]: time=2025-03-01T11:50:36.798-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:50:43 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:50:43 | 200 |  8.024380557s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9364570112 required="6.2 GiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38443"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] build info | build=0 commit="unknown" tid="139662885605376" timestamp=1740858984
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139662885605376" timestamp=1740858984 total_threads=16
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38443" tid="139662885605376" timestamp=1740858984
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.817-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:56:24 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:56:25 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:56:25 launchpad ollama[183553]: INFO [main] model loaded | tid="139662885605376" timestamp=1740858985
+Mar 01 11:56:25 launchpad ollama[1577]: time=2025-03-01T11:56:25.571-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 01 11:56:33 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:56:33 | 200 |  8.853137861s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:56:57 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:56:57 | 200 |  4.653598121s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9365159936 required="6.2 GiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36717"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] build info | build=0 commit="unknown" tid="140640671145984" timestamp=1740859495
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140640671145984" timestamp=1740859495 total_threads=16
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36717" tid="140640671145984" timestamp=1740859495
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:04:56 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:04:56 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:04:56 launchpad ollama[184840]: INFO [main] model loaded | tid="140640671145984" timestamp=1740859496
+Mar 01 12:04:56 launchpad ollama[1577]: time=2025-03-01T12:04:56.724-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 01 12:05:05 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:05:05 | 200 | 10.042959498s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.518-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9160818688 required="6.2 GiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.518-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.519-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.519-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43851"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] build info | build=0 commit="unknown" tid="140659319099392" timestamp=1740860290
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140659319099392" timestamp=1740860290 total_threads=16
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43851" tid="140659319099392" timestamp=1740860290
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.770-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:18:10 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:18:11 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:18:11 launchpad ollama[187318]: INFO [main] model loaded | tid="140659319099392" timestamp=1740860291
+Mar 01 12:18:11 launchpad ollama[1577]: time=2025-03-01T12:18:11.523-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:18:20 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:18:20 | 200 | 10.317186819s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:22:20 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:22:20 | 200 |  2.432697878s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:22:46 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:22:46 | 200 |  1.239694497s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:23:16 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:23:16 | 200 |  2.620858532s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:24:32 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:24:32 | 200 |  2.535180181s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:29:02 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 01 12:29:04 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 01 12:29:04 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 01 12:29:04 launchpad systemd[1]: ollama.service: Consumed 8min 8.911s CPU time, 5.7G memory peak, 4.6G read from disk, 508.1M written to disk, 15.1M incoming IP traffic, 13.8M outgoing IP traffic.
+-- Boot 326ba5c09665430bb47760ff439ddc9d --
+Mar 01 12:31:20 launchpad systemd[1]: Starting Server for local large language models...
+Mar 01 12:31:20 launchpad systemd[1]: Started Server for local large language models.
+Mar 01 12:31:20 launchpad ollama[1578]: 2025/03/01 12:31:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.491-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.497-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.498-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.498-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4107338876/runners
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.507-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.507-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.760-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:09 | 200 |    1.123129ms |       127.0.0.1 | HEAD     "/"
+Mar 01 12:39:09 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:09 | 200 |   16.386631ms |       127.0.0.1 | POST     "/api/show"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.716-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10398269440 required="6.2 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.717-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.717-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33739"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.719-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] build info | build=0 commit="unknown" tid="140044305031168" timestamp=1740861549
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140044305031168" timestamp=1740861549 total_threads=16
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33739" tid="140044305031168" timestamp=1740861549
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:39:10 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:39:15 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:39:15 launchpad ollama[6231]: INFO [main] model loaded | tid="140044305031168" timestamp=1740861555
+Mar 01 12:39:15 launchpad ollama[1578]: time=2025-03-01T12:39:15.989-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 01 12:39:15 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:15 | 200 |   6.44667237s |       127.0.0.1 | POST     "/api/generate"
+Mar 01 12:41:46 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:41:46 | 200 |  8.404180719s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.817-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10412621824 required="6.2 GiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.817-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.818-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.818-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38815"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] build info | build=0 commit="unknown" tid="140573945192448" timestamp=1740862035
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140573945192448" timestamp=1740862035 total_threads=16
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38815" tid="140573945192448" timestamp=1740862035
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:47:16 launchpad ollama[1578]: time=2025-03-01T12:47:16.070-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:47:16 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:47:16 launchpad ollama[7489]: INFO [main] model loaded | tid="140573945192448" timestamp=1740862036
+Mar 01 12:47:16 launchpad ollama[1578]: time=2025-03-01T12:47:16.822-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:47:22 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:47:22 | 200 |   7.34386985s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.625-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10420944896 required="6.2 GiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.625-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.626-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.626-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32795"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] build info | build=0 commit="unknown" tid="140264708308992" timestamp=1740862511
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140264708308992" timestamp=1740862511 total_threads=16
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32795" tid="140264708308992" timestamp=1740862511
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.878-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:55:11 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:55:12 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:55:12 launchpad ollama[8645]: INFO [main] model loaded | tid="140264708308992" timestamp=1740862512
+Mar 01 12:55:12 launchpad ollama[1578]: time=2025-03-01T12:55:12.631-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:55:21 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:55:21 | 200 | 10.523565209s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:58:22 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:58:22 | 200 |  7.212702764s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:02:07 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:02:07 | 200 |  6.981875324s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:05:28 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:05:28 | 200 |  9.350942946s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:09:20 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:09:20 | 200 |  7.443790312s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:47:32 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:32 | 200 |      16.092µs |       127.0.0.1 | HEAD     "/"
+Mar 03 17:47:32 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:32 | 200 |   12.822257ms |       127.0.0.1 | POST     "/api/show"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.738-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10137108480 required="6.2 GiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.738-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.739-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33649"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] build info | build=0 commit="unknown" tid="140182838722560" timestamp=1741052852
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140182838722560" timestamp=1741052852 total_threads=16
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33649" tid="140182838722560" timestamp=1741052852
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.991-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 17:47:33 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 17:47:33 launchpad ollama[138722]: INFO [main] model loaded | tid="140182838722560" timestamp=1741052853
+Mar 03 17:47:33 launchpad ollama[1578]: time=2025-03-03T17:47:33.744-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 17:47:33 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:33 | 200 |    1.1873722s |       127.0.0.1 | POST     "/api/generate"
+Mar 03 17:48:57 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:48:57 | 200 |  9.301976434s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:50:07 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:50:07 | 200 |  2.790235696s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:51:23 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:51:23 | 200 |  2.949668506s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:53:43 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:53:43 | 200 |  7.528375037s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:58:46 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:58:46 | 200 |  5.971180011s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:01:10 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:01:10 | 200 |  6.007280076s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:03:23 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:03:23 | 200 |  7.641810212s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:06:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:06:14 | 200 | 11.400963968s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:09:00 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:09:00 | 200 | 10.117915402s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:11:41 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:11:41 | 200 | 11.023541934s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:14:40 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:14:40 | 200 |  6.319622758s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:16:36 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:16:36 | 200 | 12.224033757s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:20:15 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:20:15 | 200 | 12.105570051s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:22:13 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:22:13 | 200 | 12.579028629s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:25:39 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:25:39 | 200 |   8.22646555s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.461-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10137763840 required="6.2 GiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.461-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.462-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41337"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] build info | build=0 commit="unknown" tid="140101783797760" timestamp=1741056452
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140101783797760" timestamp=1741056452 total_threads=16
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41337" tid="140101783797760" timestamp=1741056452
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.714-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 18:47:32 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 18:47:33 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 18:47:33 launchpad ollama[150110]: INFO [main] model loaded | tid="140101783797760" timestamp=1741056453
+Mar 03 18:47:33 launchpad ollama[1578]: time=2025-03-03T18:47:33.466-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 18:47:42 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:47:42 | 200 | 10.500599287s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:52:22 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:52:22 | 200 |  7.587219807s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:54:27 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:54:27 | 200 |  8.470405711s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:57:52 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:57:52 | 200 |  4.447967803s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:59:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:59:14 | 200 |  2.824189707s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:03:59 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:03:59 | 200 | 10.396395011s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:07:13 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:07:13 | 200 |  6.766175494s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:11:18 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:11:18 | 200 |  7.686619617s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:13:03 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:13:03 | 200 |  7.175445461s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:14:48 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:14:48 | 200 |  5.937771734s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:19:20 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:19:20 | 200 |  5.136648442s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.371-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10108403712 required="6.2 GiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.371-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.372-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39841"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] build info | build=0 commit="unknown" tid="140256606535680" timestamp=1741058761
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140256606535680" timestamp=1741058761 total_threads=16
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39841" tid="140256606535680" timestamp=1741058761
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.624-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 19:26:01 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 19:26:02 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 19:26:02 launchpad ollama[156193]: INFO [main] model loaded | tid="140256606535680" timestamp=1741058762
+Mar 03 19:26:02 launchpad ollama[1578]: time=2025-03-03T19:26:02.376-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 19:26:09 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:26:09 | 200 |  8.429151547s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10108403712 required="6.2 GiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41563"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] build info | build=0 commit="unknown" tid="139874576224256" timestamp=1741059132
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139874576224256" timestamp=1741059132 total_threads=16
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41563" tid="139874576224256" timestamp=1741059132
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.381-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 19:32:12 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 19:32:13 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 19:32:13 launchpad ollama[157111]: INFO [main] model loaded | tid="139874576224256" timestamp=1741059133
+Mar 03 19:32:13 launchpad ollama[1578]: time=2025-03-03T19:32:13.134-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 19:32:19 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:32:19 | 200 |  7.977784337s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:35:47 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:35:47 | 200 |  6.735567135s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:38:39 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:38:39 | 200 |  6.162431839s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:40:49 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:40:49 | 200 |  5.969356111s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:44:19 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:44:19 | 200 |  5.933615782s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:48:50 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:48:50 | 200 |  5.946570658s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:51:25 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:51:25 | 200 |  5.526996601s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:52:54 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:52:54 | 200 |  5.919910557s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:54:24 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:54:24 | 200 |  6.540529442s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:58:45 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:58:45 | 200 |  7.570463036s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.246-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10131800064 required="6.2 GiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.247-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33455"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] build info | build=0 commit="unknown" tid="140155033649152" timestamp=1741061037
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140155033649152" timestamp=1741061037 total_threads=16
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33455" tid="140155033649152" timestamp=1741061037
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 20:03:57 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 20:03:58 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 20:03:58 launchpad ollama[161748]: INFO [main] model loaded | tid="140155033649152" timestamp=1741061038
+Mar 03 20:03:58 launchpad ollama[1578]: time=2025-03-03T20:03:58.252-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 20:04:04 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:04:04 | 200 |   7.82306194s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:05:58 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:05:58 | 200 |  5.359156714s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:10:15 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:10:15 | 200 |  7.520659398s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:13:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:13:14 | 200 |  5.873403815s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9873063936 required="6.2 GiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41855"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] build info | build=0 commit="unknown" tid="139834739843072" timestamp=1741120312
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139834739843072" timestamp=1741120312 total_threads=16
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41855" tid="139834739843072" timestamp=1741120312
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.697-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:31:52 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:31:53 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:31:53 launchpad ollama[194341]: INFO [main] model loaded | tid="139834739843072" timestamp=1741120313
+Mar 04 12:31:53 launchpad ollama[1578]: time=2025-03-04T12:31:53.450-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 12:32:02 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:32:02 | 200 |  9.805607255s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:33:35 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:33:35 | 200 | 12.595152534s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9874112512 required="6.2 GiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37311"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] build info | build=0 commit="unknown" tid="140578525372416" timestamp=1741120728
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140578525372416" timestamp=1741120728 total_threads=16
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37311" tid="140578525372416" timestamp=1741120728
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.848-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:38:48 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:38:49 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:38:49 launchpad ollama[195404]: INFO [main] model loaded | tid="140578525372416" timestamp=1741120729
+Mar 04 12:38:49 launchpad ollama[1578]: time=2025-03-04T12:38:49.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 12:38:59 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:38:59 | 200 | 11.390635217s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.718-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9873457152 required="6.2 GiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.719-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.719-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34873"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] build info | build=0 commit="unknown" tid="140391407271936" timestamp=1741121106
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140391407271936" timestamp=1741121106 total_threads=16
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34873" tid="140391407271936" timestamp=1741121106
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.972-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:45:06 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:45:07 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:45:07 launchpad ollama[196314]: INFO [main] model loaded | tid="140391407271936" timestamp=1741121107
+Mar 04 12:45:07 launchpad ollama[1578]: time=2025-03-04T12:45:07.726-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 04 12:45:10 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:45:10 | 200 |   3.63866554s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:45:42 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:45:42 | 200 |  6.699999806s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:47:56 launchpad ollama[196314]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1012 n_keep=24 n_left=2024 n_shift=1012 tid="140391407271936" timestamp=1741121276
+Mar 04 12:48:07 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:48:07 | 200 | 11.361400348s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:50:31 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:50:31 | 200 |  7.939433655s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.087-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9885777920 required="6.2 GiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.087-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.088-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39039"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] build info | build=0 commit="unknown" tid="140166450544640" timestamp=1741122443
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140166450544640" timestamp=1741122443 total_threads=16
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39039" tid="140166450544640" timestamp=1741122443
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.339-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:07:23 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:07:24 launchpad ollama[200031]: INFO [main] model loaded | tid="140166450544640" timestamp=1741122444
+Mar 04 13:07:24 launchpad ollama[1578]: time=2025-03-04T13:07:24.092-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:07:31 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:07:31 | 200 |  8.591458449s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9885908992 required="6.2 GiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36843"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] build info | build=0 commit="unknown" tid="139948605792256" timestamp=1741123573
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139948605792256" timestamp=1741123573 total_threads=16
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36843" tid="139948605792256" timestamp=1741123573
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.305-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:26:13 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:26:14 launchpad ollama[202775]: INFO [main] model loaded | tid="139948605792256" timestamp=1741123574
+Mar 04 13:26:14 launchpad ollama[1578]: time=2025-03-04T13:26:14.057-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:26:22 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:26:22 | 200 |  9.800858269s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9942859776 required="6.2 GiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36987"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] build info | build=0 commit="unknown" tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139659855163392" timestamp=1741123885 total_threads=16
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36987" tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.266-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:31:25 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] model loaded | tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:26 launchpad ollama[1578]: time=2025-03-04T13:31:26.019-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:31:33 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:31:33 | 200 |  8.730906641s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:36:27 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:36:27 | 200 |  7.911130843s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:38:27 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:38:27 | 200 |  4.974464312s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9953083392 required="6.2 GiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.773-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33513"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] build info | build=0 commit="unknown" tid="140206603853824" timestamp=1741124692
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140206603853824" timestamp=1741124692 total_threads=16
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33513" tid="140206603853824" timestamp=1741124692
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:44:53 launchpad ollama[1578]: time=2025-03-04T13:44:53.025-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:44:53 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:44:53 launchpad ollama[205875]: INFO [main] model loaded | tid="140206603853824" timestamp=1741124693
+Mar 04 13:44:53 launchpad ollama[1578]: time=2025-03-04T13:44:53.778-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:45:01 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:45:01 | 200 |  8.964763487s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:48:43 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:48:43 | 200 |   6.12199874s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:51:48 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:51:48 | 200 |  5.077748233s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:55:25 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:55:25 | 200 |  4.528189605s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9737601024 required="6.2 GiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36371"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] build info | build=0 commit="unknown" tid="140468030492672" timestamp=1741130147
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140468030492672" timestamp=1741130147 total_threads=16
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36371" tid="140468030492672" timestamp=1741130147
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 15:15:48 launchpad ollama[1578]: time=2025-03-04T15:15:48.107-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 15:15:48 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 15:15:48 launchpad ollama[220785]: INFO [main] model loaded | tid="140468030492672" timestamp=1741130148
+Mar 04 15:15:48 launchpad ollama[1578]: time=2025-03-04T15:15:48.860-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 15:15:54 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:15:54 | 200 |  6.965924507s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:17:34 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:17:34 | 200 |   5.09332902s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:20:53 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:20:53 | 200 |  5.454866611s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:23:09 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:23:09 | 200 |  7.166144255s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:25:13 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:25:13 | 200 |  4.137473438s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:25:59 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:25:59 | 200 |  4.413648885s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:30:09 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:30:09 | 200 |  5.752504454s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:32:21 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:32:21 | 200 |  4.525342001s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:33:08 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:33:08 | 200 |  3.838820963s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:35:01 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:35:01 | 200 |   8.24367465s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:39:26 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:39:26 | 200 |  8.160357469s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:42:08 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:42:08 | 200 | 12.372227225s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:45:40 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:45:40 | 200 |  11.56729297s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9806938112 required="6.2 GiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33095"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.658-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] build info | build=0 commit="unknown" tid="140104411656192" timestamp=1741133035
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140104411656192" timestamp=1741133035 total_threads=16
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33095" tid="140104411656192" timestamp=1741133035
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.909-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 16:03:55 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 16:03:56 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 16:03:56 launchpad ollama[227948]: INFO [main] model loaded | tid="140104411656192" timestamp=1741133036
+Mar 04 16:03:56 launchpad ollama[1578]: time=2025-03-04T16:03:56.662-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 16:04:04 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:04:04 | 200 |   9.37725366s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9806741504 required="6.2 GiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42015"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] build info | build=0 commit="unknown" tid="140637111410688" timestamp=1741133373
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140637111410688" timestamp=1741133373 total_threads=16
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42015" tid="140637111410688" timestamp=1741133373
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.459-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 16:09:33 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 16:09:34 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 16:09:34 launchpad ollama[228777]: INFO [main] model loaded | tid="140637111410688" timestamp=1741133374
+Mar 04 16:09:34 launchpad ollama[1578]: time=2025-03-04T16:09:34.212-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 16:09:41 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:09:41 | 200 |  8.484282432s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:12:05 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:12:05 | 200 |  6.186185362s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:15:49 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 04 16:15:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 04 16:15:51 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 04 16:15:51 launchpad systemd[1]: ollama.service: Consumed 10min 14.208s CPU time, 5.6G memory peak, 4.6G read from disk, 508.1M written to disk, 21.8M incoming IP traffic, 19.4M outgoing IP traffic.
+-- Boot 7ef76d8f1ea24523977a1c63a31424bf --
+Mar 04 16:16:37 launchpad systemd[1]: Starting Server for local large language models...
+Mar 04 16:16:37 launchpad systemd[1]: Started Server for local large language models.
+Mar 04 16:16:37 launchpad ollama[1574]: 2025/03/04 16:16:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.511-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.516-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.516-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.518-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1010747878/runners
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.520-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.520-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.763-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:19 | 200 |    1.265493ms |       127.0.0.1 | HEAD     "/"
+Mar 05 14:05:19 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:19 | 200 |    16.01174ms |       127.0.0.1 | POST     "/api/show"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879683072 required="6.2 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.345-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38609"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] build info | build=0 commit="unknown" tid="139986878709760" timestamp=1741212319
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139986878709760" timestamp=1741212319 total_threads=16
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38609" tid="139986878709760" timestamp=1741212319
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.597-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:05:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:05:25 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:05:25 launchpad ollama[83604]: INFO [main] model loaded | tid="139986878709760" timestamp=1741212325
+Mar 05 14:05:25 launchpad ollama[1574]: time=2025-03-05T14:05:25.623-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.28 seconds"
+Mar 05 14:05:25 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:25 | 200 |  6.455429877s |       127.0.0.1 | POST     "/api/generate"
+Mar 05 14:06:09 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:06:09 | 200 |  8.624030612s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:08:13 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:08:13 | 200 |  6.101035016s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:10:03 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:10:03 | 200 |  7.784464299s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9946464256 required="6.2 GiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39435"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] build info | build=0 commit="unknown" tid="139944768126976" timestamp=1741213039
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944768126976" timestamp=1741213039 total_threads=16
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39435" tid="139944768126976" timestamp=1741213039
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.903-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:17:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:17:20 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:17:20 launchpad ollama[85535]: INFO [main] model loaded | tid="139944768126976" timestamp=1741213040
+Mar 05 14:17:20 launchpad ollama[1574]: time=2025-03-05T14:17:20.656-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:17:30 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:17:30 | 200 | 10.994111478s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:21:33 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:21:33 | 200 | 12.040119362s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:24:51 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:24:51 | 200 |  9.340320565s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:27:26 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:27:26 | 200 |  9.939923048s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9950003200 required="6.2 GiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34681"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] build info | build=0 commit="unknown" tid="139831697833984" timestamp=1741214082
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139831697833984" timestamp=1741214082 total_threads=16
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34681" tid="139831697833984" timestamp=1741214082
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.609-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:34:42 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:34:43 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:34:43 launchpad ollama[88156]: INFO [main] model loaded | tid="139831697833984" timestamp=1741214083
+Mar 05 14:34:43 launchpad ollama[1574]: time=2025-03-05T14:34:43.363-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:34:52 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:34:52 | 200 |  9.988378986s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9952231424 required="6.2 GiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44085"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] build info | build=0 commit="unknown" tid="140117391159296" timestamp=1741214804
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140117391159296" timestamp=1741214804 total_threads=16
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44085" tid="140117391159296" timestamp=1741214804
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.942-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:46:44 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:46:45 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:46:45 launchpad ollama[90026]: INFO [main] model loaded | tid="140117391159296" timestamp=1741214805
+Mar 05 14:46:45 launchpad ollama[1574]: time=2025-03-05T14:46:45.696-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:46:52 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:46:52 | 200 |  8.298066043s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:50:48 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:50:48 | 200 |  8.191092759s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9947906048 required="6.2 GiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37563"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] build info | build=0 commit="unknown" tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140347882876928" timestamp=1741215607 total_threads=16
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37563" tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:00:07 launchpad ollama[1574]: time=2025-03-05T15:00:07.235-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:00:07 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] model loaded | tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[1574]: time=2025-03-05T15:00:07.989-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:00:16 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:00:16 | 200 |  9.488140419s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9967173632 required="6.2 GiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.287-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40239"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] build info | build=0 commit="unknown" tid="139942513655808" timestamp=1741215961
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139942513655808" timestamp=1741215961 total_threads=16
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40239" tid="139942513655808" timestamp=1741215961
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.539-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:06:01 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:06:02 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:06:02 launchpad ollama[93480]: INFO [main] model loaded | tid="139942513655808" timestamp=1741215962
+Mar 05 15:06:02 launchpad ollama[1574]: time=2025-03-05T15:06:02.292-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:06:10 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:06:10 | 200 |  9.075859834s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.322-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9986179072 required="6.2 GiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.322-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.323-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.323-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34679"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] build info | build=0 commit="unknown" tid="139884592377856" timestamp=1741216690
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139884592377856" timestamp=1741216690 total_threads=16
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34679" tid="139884592377856" timestamp=1741216690
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.574-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:18:10 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:18:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:18:11 launchpad ollama[95509]: INFO [main] model loaded | tid="139884592377856" timestamp=1741216691
+Mar 05 15:18:11 launchpad ollama[1574]: time=2025-03-05T15:18:11.327-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:18:16 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:18:16 | 200 |  6.278557801s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:20:01 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:20:01 | 200 |  3.639772193s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:22:36 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:22:36 | 200 |  2.641605505s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9964879872 required="6.2 GiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34753"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.211-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] build info | build=0 commit="unknown" tid="140433811144704" timestamp=1741217383
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140433811144704" timestamp=1741217383 total_threads=16
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34753" tid="140433811144704" timestamp=1741217383
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.462-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:29:43 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:29:44 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:29:44 launchpad ollama[97994]: INFO [main] model loaded | tid="140433811144704" timestamp=1741217384
+Mar 05 15:29:44 launchpad ollama[1574]: time=2025-03-05T15:29:44.214-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:29:49 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:29:49 | 200 |   6.96253847s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:31:12 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:31:12 | 200 |  5.408200654s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.279-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9963241472 required="6.2 GiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.279-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.280-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37607"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] build info | build=0 commit="unknown" tid="139922420887552" timestamp=1741218310
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139922420887552" timestamp=1741218310 total_threads=16
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37607" tid="139922420887552" timestamp=1741218310
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.532-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:45:10 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:45:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:45:11 launchpad ollama[100524]: INFO [main] model loaded | tid="139922420887552" timestamp=1741218311
+Mar 05 15:45:11 launchpad ollama[1574]: time=2025-03-05T15:45:11.285-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:45:17 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:45:17 | 200 |  7.786290161s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:50:04 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:50:04 | 200 |  6.974356031s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:51:54 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:51:54 | 200 |  5.587612998s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:53:05 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:53:05 | 200 |  7.752311164s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:55:45 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:55:45 | 200 |   5.73340252s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:59:21 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:59:21 | 200 |  3.732399405s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9962913792 required="6.2 GiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39665"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] build info | build=0 commit="unknown" tid="140386877947904" timestamp=1741219525
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140386877947904" timestamp=1741219525 total_threads=16
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39665" tid="140386877947904" timestamp=1741219525
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.979-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 16:05:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 16:05:26 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 16:05:26 launchpad ollama[103480]: INFO [main] model loaded | tid="140386877947904" timestamp=1741219526
+Mar 05 16:05:26 launchpad ollama[1574]: time=2025-03-05T16:05:26.732-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 16:05:32 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:05:32 | 200 |  6.557560381s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9941549056 required="6.2 GiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39547"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] build info | build=0 commit="unknown" tid="140420897046528" timestamp=1741220678
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140420897046528" timestamp=1741220678 total_threads=16
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39547" tid="140420897046528" timestamp=1741220678
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 16:24:39 launchpad ollama[1574]: time=2025-03-05T16:24:39.167-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 16:24:39 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 16:24:39 launchpad ollama[106740]: INFO [main] model loaded | tid="140420897046528" timestamp=1741220679
+Mar 05 16:24:39 launchpad ollama[1574]: time=2025-03-05T16:24:39.920-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 05 16:24:47 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:24:47 | 200 |  8.812184503s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:25:41 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:25:41 | 200 |   6.36861991s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:30:10 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:30:10 | 200 |  4.117289805s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:11:26 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:26 | 200 |      16.604µs |       127.0.0.1 | HEAD     "/"
+Mar 07 09:11:26 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:26 | 200 |   13.376506ms |       127.0.0.1 | POST     "/api/show"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.632-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9693429760 required="6.2 GiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.633-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.633-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42183"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] build info | build=0 commit="unknown" tid="140457191243776" timestamp=1741367486
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140457191243776" timestamp=1741367486 total_threads=16
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42183" tid="140457191243776" timestamp=1741367486
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.885-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:11:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:11:27 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:11:27 launchpad ollama[215099]: INFO [main] model loaded | tid="140457191243776" timestamp=1741367487
+Mar 07 09:11:27 launchpad ollama[1574]: time=2025-03-07T09:11:27.638-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 07 09:11:27 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:27 | 200 |  1.178096763s |       127.0.0.1 | POST     "/api/generate"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9706602496 required="6.2 GiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.988-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40441"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] build info | build=0 commit="unknown" tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140543773380608" timestamp=1741367805 total_threads=16
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40441" tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:16:45 launchpad ollama[1574]: time=2025-03-07T09:16:45.239-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:16:45 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] model loaded | tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[1574]: time=2025-03-07T09:16:45.994-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 07 09:16:55 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:16:55 | 200 | 10.219142304s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.641-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9727442944 required="6.2 GiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.642-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.642-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40913"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] build info | build=0 commit="unknown" tid="140230817394688" timestamp=1741368747
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140230817394688" timestamp=1741368747 total_threads=16
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40913" tid="140230817394688" timestamp=1741368747
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.895-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:32:27 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:32:28 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:32:28 launchpad ollama[218242]: INFO [main] model loaded | tid="140230817394688" timestamp=1741368748
+Mar 07 09:32:28 launchpad ollama[1574]: time=2025-03-07T09:32:28.649-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 07 09:32:37 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:32:37 | 200 |  9.888731563s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:35:12 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:35:12 | 200 |  9.346114096s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9768140800 required="6.2 GiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.963-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34641"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] build info | build=0 commit="unknown" tid="140378217746432" timestamp=1741369515
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140378217746432" timestamp=1741369515 total_threads=16
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34641" tid="140378217746432" timestamp=1741369515
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:45:16 launchpad ollama[1574]: time=2025-03-07T09:45:16.215-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:45:16 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:45:16 launchpad ollama[220114]: INFO [main] model loaded | tid="140378217746432" timestamp=1741369516
+Mar 07 09:45:16 launchpad ollama[1574]: time=2025-03-07T09:45:16.968-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 07 09:45:25 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:45:25 | 200 |  9.994966063s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:48:48 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:48:48 | 200 |  4.196691165s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:24:07 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:07 | 200 |       14.39µs |       127.0.0.1 | HEAD     "/"
+Mar 08 13:24:07 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:07 | 200 |   12.475666ms |       127.0.0.1 | POST     "/api/show"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9611706368 required="6.2 GiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40479"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] build info | build=0 commit="unknown" tid="139759328415744" timestamp=1741469047
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139759328415744" timestamp=1741469047 total_threads=16
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40479" tid="139759328415744" timestamp=1741469047
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:24:08 launchpad ollama[1574]: time=2025-03-08T13:24:08.195-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:24:08 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:24:08 launchpad ollama[326716]: INFO [main] model loaded | tid="139759328415744" timestamp=1741469048
+Mar 08 13:24:08 launchpad ollama[1574]: time=2025-03-08T13:24:08.949-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 13:24:08 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:08 | 200 |  1.183460126s |       127.0.0.1 | POST     "/api/generate"
+Mar 08 13:27:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:27:47 | 200 |  1.094116677s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:28:25 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:28:25 | 200 |  4.521793374s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:29:51 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:29:51 | 200 |  7.506289437s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:32:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:32:15 | 200 |  6.663518074s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9673506816 required="6.2 GiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33731"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] build info | build=0 commit="unknown" tid="140209512603648" timestamp=1741469836
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140209512603648" timestamp=1741469836 total_threads=16
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33731" tid="140209512603648" timestamp=1741469836
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.889-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:37:16 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:37:17 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:37:17 launchpad ollama[328680]: INFO [main] model loaded | tid="140209512603648" timestamp=1741469837
+Mar 08 13:37:17 launchpad ollama[1574]: time=2025-03-08T13:37:17.643-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 13:37:28 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:37:28 | 200 | 12.164662172s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:42:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:42:00 | 200 |  2.284449518s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:43:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:43:15 | 200 |   1.23992856s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:44:55 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:44:55 | 200 |  4.428215426s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9673834496 required="6.2 GiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46137"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] build info | build=0 commit="unknown" tid="139948252450816" timestamp=1741470707
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139948252450816" timestamp=1741470707 total_threads=16
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46137" tid="139948252450816" timestamp=1741470707
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:51:48 launchpad ollama[1574]: time=2025-03-08T13:51:48.063-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:51:48 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:51:48 launchpad ollama[330837]: INFO [main] model loaded | tid="139948252450816" timestamp=1741470708
+Mar 08 13:51:48 launchpad ollama[1574]: time=2025-03-08T13:51:48.816-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 13:51:54 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:51:54 | 200 |  6.439826955s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9608953856 required="6.2 GiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44841"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] build info | build=0 commit="unknown" tid="140476464001024" timestamp=1741471339
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476464001024" timestamp=1741471339 total_threads=16
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44841" tid="140476464001024" timestamp=1741471339
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.750-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:02:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:02:20 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:02:20 launchpad ollama[332538]: INFO [main] model loaded | tid="140476464001024" timestamp=1741471340
+Mar 08 14:02:20 launchpad ollama[1574]: time=2025-03-08T14:02:20.504-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:02:26 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:02:26 | 200 |  7.576100041s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:06:41 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:06:41 | 200 |  5.728637896s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:09:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:09:37 | 200 |  7.661746806s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:11:33 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:11:33 | 200 |  4.219460673s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:13:03 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:13:03 | 200 |  6.952883113s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.763-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9704505344 required="6.2 GiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.764-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.764-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33217"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] build info | build=0 commit="unknown" tid="140236733587456" timestamp=1741473070
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140236733587456" timestamp=1741473070 total_threads=16
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33217" tid="140236733587456" timestamp=1741473070
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:31:11 launchpad ollama[1574]: time=2025-03-08T14:31:11.016-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:31:11 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:31:11 launchpad ollama[336876]: INFO [main] model loaded | tid="140236733587456" timestamp=1741473071
+Mar 08 14:31:11 launchpad ollama[1574]: time=2025-03-08T14:31:11.769-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:31:21 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:31:21 | 200 |  11.16257887s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9704898560 required="6.2 GiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36233"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] build info | build=0 commit="unknown" tid="140642937167872" timestamp=1741473399
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140642937167872" timestamp=1741473399 total_threads=16
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36233" tid="140642937167872" timestamp=1741473399
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.558-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:36:39 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:36:40 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:36:40 launchpad ollama[337733]: INFO [main] model loaded | tid="140642937167872" timestamp=1741473400
+Mar 08 14:36:40 launchpad ollama[1574]: time=2025-03-08T14:36:40.311-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:36:46 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:36:46 | 200 |  7.386375645s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.578-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9705095168 required="6.2 GiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.578-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.579-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.579-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36931"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] build info | build=0 commit="unknown" tid="140399294124032" timestamp=1741474053
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140399294124032" timestamp=1741474053 total_threads=16
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36931" tid="140399294124032" timestamp=1741474053
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.831-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:47:33 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:47:34 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:47:34 launchpad ollama[339335]: INFO [main] model loaded | tid="140399294124032" timestamp=1741474054
+Mar 08 14:47:34 launchpad ollama[1574]: time=2025-03-08T14:47:34.584-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:47:43 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:47:43 | 200 |  10.29508951s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:50:56 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:50:56 | 200 |  8.405256104s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9687072768 required="6.2 GiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37243"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] build info | build=0 commit="unknown" tid="140708884348928" timestamp=1741474888
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140708884348928" timestamp=1741474888 total_threads=16
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37243" tid="140708884348928" timestamp=1741474888
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.565-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:01:28 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:01:29 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:01:29 launchpad ollama[341747]: INFO [main] model loaded | tid="140708884348928" timestamp=1741474889
+Mar 08 15:01:29 launchpad ollama[1574]: time=2025-03-08T15:01:29.317-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:01:35 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:01:35 | 200 |  6.908397259s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:05:13 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:05:13 | 200 |  2.445630051s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:05:40 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:05:40 | 200 |  5.419790376s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:09:28 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:09:28 | 200 |   6.92806316s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.053-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9687400448 required="6.2 GiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.053-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.054-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46617"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] build info | build=0 commit="unknown" tid="140497237594112" timestamp=1741475738
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140497237594112" timestamp=1741475738 total_threads=16
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46617" tid="140497237594112" timestamp=1741475738
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.306-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:15:38 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:15:39 launchpad ollama[343797]: INFO [main] model loaded | tid="140497237594112" timestamp=1741475739
+Mar 08 15:15:39 launchpad ollama[1574]: time=2025-03-08T15:15:39.060-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:15:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:15:47 | 200 |  9.394842213s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9692184576 required="6.2 GiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42417"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] build info | build=0 commit="unknown" tid="140415295508480" timestamp=1741476299
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140415295508480" timestamp=1741476299 total_threads=16
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42417" tid="140415295508480" timestamp=1741476299
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:25:00 launchpad ollama[1574]: time=2025-03-08T15:25:00.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:25:00 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:25:00 launchpad ollama[345242]: INFO [main] model loaded | tid="140415295508480" timestamp=1741476300
+Mar 08 15:25:00 launchpad ollama[1574]: time=2025-03-08T15:25:00.798-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 15:25:10 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:25:10 | 200 | 10.482071839s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9692119040 required="6.2 GiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39793"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] build info | build=0 commit="unknown" tid="139987294150656" timestamp=1741476708
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139987294150656" timestamp=1741476708 total_threads=16
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39793" tid="139987294150656" timestamp=1741476708
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.704-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:31:48 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:31:49 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:31:49 launchpad ollama[346269]: INFO [main] model loaded | tid="139987294150656" timestamp=1741476709
+Mar 08 15:31:49 launchpad ollama[1574]: time=2025-03-08T15:31:49.456-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:31:56 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:31:56 | 200 |  8.241596791s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:34:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:34:00 | 200 |  4.559451795s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.873-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9697034240 required="6.2 GiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.873-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.874-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45995"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] build info | build=0 commit="unknown" tid="139937916739584" timestamp=1741477474
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139937916739584" timestamp=1741477474 total_threads=16
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45995" tid="139937916739584" timestamp=1741477474
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:44:35 launchpad ollama[1574]: time=2025-03-08T15:44:35.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:44:35 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:44:35 launchpad ollama[348127]: INFO [main] model loaded | tid="139937916739584" timestamp=1741477475
+Mar 08 15:44:35 launchpad ollama[1574]: time=2025-03-08T15:44:35.879-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:44:43 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:44:43 | 200 |  8.892522358s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696509952 required="6.2 GiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42471"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] build info | build=0 commit="unknown" tid="140647372591104" timestamp=1741478016
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140647372591104" timestamp=1741478016 total_threads=16
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42471" tid="140647372591104" timestamp=1741478016
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.890-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:53:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:53:37 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:53:37 launchpad ollama[349479]: INFO [main] model loaded | tid="140647372591104" timestamp=1741478017
+Mar 08 15:53:37 launchpad ollama[1574]: time=2025-03-08T15:53:37.642-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:53:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:53:47 | 200 | 10.553367802s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:58:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:58:00 | 200 | 11.997268489s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:00:39 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:00:39 | 200 |  7.990518084s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:02:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:02:15 | 200 | 10.239060648s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9697034240 required="6.2 GiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36165"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] build info | build=0 commit="unknown" tid="140092657197056" timestamp=1741478970
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140092657197056" timestamp=1741478970 total_threads=16
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36165" tid="140092657197056" timestamp=1741478970
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.295-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:09:30 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:09:31 launchpad ollama[351795]: INFO [main] model loaded | tid="140092657197056" timestamp=1741478971
+Mar 08 16:09:31 launchpad ollama[1574]: time=2025-03-08T16:09:31.048-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:09:41 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:09:41 | 200 | 11.183100395s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.945-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696509952 required="6.2 GiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.945-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.946-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.946-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39539"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] build info | build=0 commit="unknown" tid="140416039989248" timestamp=1741479547
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140416039989248" timestamp=1741479547 total_threads=16
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39539" tid="140416039989248" timestamp=1741479547
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:19:08 launchpad ollama[1574]: time=2025-03-08T16:19:08.197-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:19:08 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:19:08 launchpad ollama[353210]: INFO [main] model loaded | tid="140416039989248" timestamp=1741479548
+Mar 08 16:19:08 launchpad ollama[1574]: time=2025-03-08T16:19:08.951-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:19:18 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:19:18 | 200 | 10.486038057s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696837632 required="6.2 GiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41655"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] build info | build=0 commit="unknown" tid="140339924946944" timestamp=1741480356
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140339924946944" timestamp=1741480356 total_threads=16
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41655" tid="140339924946944" timestamp=1741480356
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.829-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:32:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:32:37 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:32:37 launchpad ollama[355180]: INFO [main] model loaded | tid="140339924946944" timestamp=1741480357
+Mar 08 16:32:37 launchpad ollama[1574]: time=2025-03-08T16:32:37.583-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:32:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:32:47 | 200 | 11.114139548s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:37:25 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:37:25 | 200 |  9.219364707s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9702342656 required="6.2 GiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37857"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.050-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] build info | build=0 commit="unknown" tid="140671976652800" timestamp=1741481142
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140671976652800" timestamp=1741481142 total_threads=16
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37857" tid="140671976652800" timestamp=1741481142
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.300-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:45:42 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:45:43 launchpad ollama[357133]: INFO [main] model loaded | tid="140671976652800" timestamp=1741481143
+Mar 08 16:45:43 launchpad ollama[1574]: time=2025-03-08T16:45:43.054-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:45:52 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:45:52 | 200 | 10.725293793s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9702277120 required="6.2 GiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37797"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] build info | build=0 commit="unknown" tid="140605006110720" timestamp=1741481666
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140605006110720" timestamp=1741481666 total_threads=16
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37797" tid="140605006110720" timestamp=1741481666
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.284-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:54:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:54:27 launchpad ollama[358418]: INFO [main] model loaded | tid="140605006110720" timestamp=1741481667
+Mar 08 16:54:27 launchpad ollama[1574]: time=2025-03-08T16:54:27.038-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 16:54:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:54:37 | 200 | 11.798630458s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9657909248 required="6.2 GiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37719"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] build info | build=0 commit="unknown" tid="140412091461632" timestamp=1741482300
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140412091461632" timestamp=1741482300 total_threads=16
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37719" tid="140412091461632" timestamp=1741482300
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.322-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 17:05:00 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 17:05:01 launchpad ollama[360343]: INFO [main] model loaded | tid="140412091461632" timestamp=1741482301
+Mar 08 17:05:01 launchpad ollama[1574]: time=2025-03-08T17:05:01.075-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 17:05:10 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:05:10 | 200 | 11.046435631s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:08:22 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:08:22 | 200 |  7.973867747s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:12:24 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:12:24 | 200 |  8.143167313s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:13:42 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:13:42 | 200 | 10.419543506s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:17:01 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:17:01 | 200 |  9.017646314s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:18:36 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:18:36 | 200 |  7.896930732s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:19:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:19:37 | 200 |  9.521671874s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:22:54 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:22:54 | 200 |  7.680583303s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:27:08 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:27:08 | 200 |  8.928069267s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:28:26 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:28:26 | 200 |   7.56655869s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:31:04 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:31:04 | 200 | 10.575788818s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9706471424 required="6.2 GiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.227-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36051"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.227-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.228-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.228-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] build info | build=0 commit="unknown" tid="139895185006592" timestamp=1741485192
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139895185006592" timestamp=1741485192 total_threads=16
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36051" tid="139895185006592" timestamp=1741485192
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.478-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 17:53:12 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 17:53:13 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 17:53:13 launchpad ollama[367363]: INFO [main] model loaded | tid="139895185006592" timestamp=1741485193
+Mar 08 17:53:13 launchpad ollama[1574]: time=2025-03-08T17:53:13.231-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 17:53:17 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:53:17 | 200 |  5.336332001s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:55:09 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:55:09 | 200 |  2.911486641s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:57:36 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:57:36 | 200 |   3.86944111s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:18:45 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:45 | 200 |      15.084µs |       127.0.0.1 | HEAD     "/"
+Mar 11 13:18:45 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:45 | 200 |   12.645877ms |       127.0.0.1 | POST     "/api/show"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9554821120 required="6.2 GiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45623"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] build info | build=0 commit="unknown" tid="140425920761856" timestamp=1741724325
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140425920761856" timestamp=1741724325 total_threads=16
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45623" tid="140425920761856" timestamp=1741724325
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.725-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:18:45 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:18:46 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:18:46 launchpad ollama[480392]: INFO [main] model loaded | tid="140425920761856" timestamp=1741724326
+Mar 11 13:18:46 launchpad ollama[1574]: time=2025-03-11T13:18:46.478-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:18:46 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:46 | 200 |  1.178127813s |       127.0.0.1 | POST     "/api/generate"
+Mar 11 13:20:50 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:20:50 | 200 |  8.295163398s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.049-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9596895232 required="6.2 GiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.049-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.050-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39123"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] build info | build=0 commit="unknown" tid="140584829214720" timestamp=1741724764
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140584829214720" timestamp=1741724764 total_threads=16
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39123" tid="140584829214720" timestamp=1741724764
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.302-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:26:04 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:26:05 launchpad ollama[481615]: INFO [main] model loaded | tid="140584829214720" timestamp=1741724765
+Mar 11 13:26:05 launchpad ollama[1574]: time=2025-03-11T13:26:05.055-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:26:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:26:16 | 200 | 12.615909649s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665445888 required="6.2 GiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34005"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] build info | build=0 commit="unknown" tid="139711692562432" timestamp=1741725275
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139711692562432" timestamp=1741725275 total_threads=16
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34005" tid="139711692562432" timestamp=1741725275
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:34:36 launchpad ollama[1574]: time=2025-03-11T13:34:36.201-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:34:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:34:36 launchpad ollama[482865]: INFO [main] model loaded | tid="139711692562432" timestamp=1741725276
+Mar 11 13:34:36 launchpad ollama[1574]: time=2025-03-11T13:34:36.953-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:34:48 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:34:48 | 200 | 12.905559978s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:36:27 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:36:27 | 200 |  8.950251273s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:39:04 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:39:04 | 200 | 13.612695266s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665314816 required="6.2 GiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44719"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] build info | build=0 commit="unknown" tid="139822015393792" timestamp=1741726198
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139822015393792" timestamp=1741726198 total_threads=16
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44719" tid="139822015393792" timestamp=1741726198
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.847-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:49:58 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:49:59 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:49:59 launchpad ollama[485128]: INFO [main] model loaded | tid="139822015393792" timestamp=1741726199
+Mar 11 13:49:59 launchpad ollama[1574]: time=2025-03-11T13:49:59.600-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:50:12 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:50:12 | 200 | 13.774786975s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9661054976 required="6.2 GiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42067"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.887-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] build info | build=0 commit="unknown" tid="139867479465984" timestamp=1741726685
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139867479465984" timestamp=1741726685 total_threads=16
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42067" tid="139867479465984" timestamp=1741726685
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:58:06 launchpad ollama[1574]: time=2025-03-11T13:58:06.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:58:06 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:58:06 launchpad ollama[486611]: INFO [main] model loaded | tid="139867479465984" timestamp=1741726686
+Mar 11 13:58:06 launchpad ollama[1574]: time=2025-03-11T13:58:06.890-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:58:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:58:16 | 200 |  10.61464432s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:00:41 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:00:41 | 200 |  7.264593476s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:03:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:03:29 | 200 | 12.950179046s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9669115904 required="6.2 GiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32801"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] build info | build=0 commit="unknown" tid="140076242849792" timestamp=1741727405
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140076242849792" timestamp=1741727405 total_threads=16
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32801" tid="140076242849792" timestamp=1741727405
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:10:06 launchpad ollama[1574]: time=2025-03-11T14:10:06.165-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:10:06 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:10:06 launchpad ollama[488380]: INFO [main] model loaded | tid="140076242849792" timestamp=1741727406
+Mar 11 14:10:06 launchpad ollama[1574]: time=2025-03-11T14:10:06.917-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:10:12 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:10:12 | 200 |  6.487016241s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:14:25 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:14:25 | 200 |  4.391287277s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:16:10 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:16:10 | 200 |  5.606884637s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:18:51 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:18:51 | 200 | 21.000459759s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:22:53 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:22:53 | 200 | 11.938722795s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665249280 required="6.2 GiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46727"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] build info | build=0 commit="unknown" tid="140683859144704" timestamp=1741728675
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140683859144704" timestamp=1741728675 total_threads=16
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46727" tid="140683859144704" timestamp=1741728675
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.429-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:31:15 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:31:16 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:31:16 launchpad ollama[491723]: INFO [main] model loaded | tid="140683859144704" timestamp=1741728676
+Mar 11 14:31:16 launchpad ollama[1574]: time=2025-03-11T14:31:16.181-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:31:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:31:29 | 200 | 14.558833703s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:35:18 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:35:18 | 200 |  10.25274678s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:39:40 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:39:40 | 200 | 10.013498147s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.096-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9664397312 required="6.2 GiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.096-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.097-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38937"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] build info | build=0 commit="unknown" tid="140039000276992" timestamp=1741729497
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039000276992" timestamp=1741729497 total_threads=16
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38937" tid="140039000276992" timestamp=1741729497
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.349-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:44:57 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:44:58 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:44:58 launchpad ollama[493743]: INFO [main] model loaded | tid="140039000276992" timestamp=1741729498
+Mar 11 14:44:58 launchpad ollama[1574]: time=2025-03-11T14:44:58.102-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:45:11 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:45:11 | 200 | 14.201247672s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:47:39 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:47:39 | 200 | 11.686216522s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:49:25 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:49:25 | 200 |  2.710858779s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:50:19 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:50:19 | 200 |  4.865790386s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:51:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:51:29 | 200 |  9.980005186s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:52:49 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:52:49 | 200 |  9.339118759s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:53:47 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:53:47 | 200 |  6.452323377s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:57:59 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:57:59 | 200 | 12.411862801s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.042-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9669378048 required="6.2 GiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.042-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.043-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39977"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] build info | build=0 commit="unknown" tid="140604860256256" timestamp=1741730598
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604860256256" timestamp=1741730598 total_threads=16
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39977" tid="140604860256256" timestamp=1741730598
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.294-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 15:03:18 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 15:03:19 launchpad ollama[496459]: INFO [main] model loaded | tid="140604860256256" timestamp=1741730599
+Mar 11 15:03:19 launchpad ollama[1574]: time=2025-03-11T15:03:19.047-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 15:03:31 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:03:31 | 200 | 14.049275516s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:07:23 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:07:23 | 200 |  6.792752956s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:11:30 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:11:30 | 200 |  5.378376894s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:13:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:13:16 | 200 | 10.550209232s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:16:10 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:16:10 | 200 |  4.154373675s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 16:50:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 11 16:50:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 11 16:50:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 11 16:50:25 launchpad systemd[1]: ollama.service: Consumed 16min 11.428s CPU time, 5.7G memory peak, 4.6G read from disk, 508.1M written to disk, 26.6M incoming IP traffic, 26.3M outgoing IP traffic.
+-- Boot 209de18b3b8242eb9551bbe34958d7c3 --
+Mar 12 10:23:02 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:23:02 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:23:02 launchpad ollama[1578]: 2025/03/12 10:23:02 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.672-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.677-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.680-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.681-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1906912190/runners
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.609-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.610-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.840-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 10:23:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 10:23:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 10:23:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 10:23:25 launchpad systemd[1]: ollama.service: Consumed 3.346s CPU time, 787M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot b91eb1767cd4479e933a82eeca6a29f4 --
+Mar 12 10:23:56 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:23:56 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:23:56 launchpad ollama[1580]: 2025/03/12 10:23:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.400-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.405-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.407-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.411-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3159489295/runners
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.409-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.410-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.410-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.411-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.411-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.653-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 10:24:28 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 10:24:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 10:24:29 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 10:24:29 launchpad systemd[1]: ollama.service: Consumed 3.400s CPU time, 787.5M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot 40aecf6c345c46cd88d74fc846eed1f8 --
+Mar 12 10:25:07 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:25:07 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:25:07 launchpad ollama[1576]: 2025/03/12 10:25:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.456-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.461-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.462-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.464-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1772389787/runners
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.454-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.455-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.455-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.456-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.456-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.711-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 15:49:11 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:49:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:49:12 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:49:12 launchpad systemd[1]: ollama.service: Consumed 4.112s CPU time, 787.8M memory peak, 234.1M read from disk, 508.1M written to disk.
+Mar 12 15:49:17 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:49:18 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:49:18 launchpad ollama[171362]: 2025/03/12 15:49:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.056-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2630945923/runners
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.141-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.373-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.9 GiB"
+Mar 12 15:54:24 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:54:24 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:54:24 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:54:24 launchpad systemd[1]: ollama.service: Consumed 3.556s CPU time, 552.4M memory peak, 508.1M written to disk.
+-- Boot 603fdba76a074e8aafa0425aa051a545 --
+Mar 12 15:55:06 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:55:06 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:55:06 launchpad ollama[1582]: 2025/03/12 15:55:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.198-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.203-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.204-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.205-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama878283110/runners
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.184-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:10 launchpad ollama[1582]: time=2025-03-12T15:55:10.952-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 12 15:56:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:56:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:56:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:56:25 launchpad systemd[1]: ollama.service: Consumed 5.346s CPU time, 786.6M memory peak, 233.6M read from disk, 508.1M written to disk.
+-- Boot 0ea312c34cf84f328f700fef1b38b368 --
+Mar 12 15:57:00 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:57:00 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:57:00 launchpad ollama[1583]: 2025/03/12 15:57:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.992-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.997-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.998-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:57:01 launchpad ollama[1583]: time=2025-03-12T15:57:01.000-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1197272828/runners
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.015-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.015-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.257-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:04 | 200 |     597.248µs |       127.0.0.1 | HEAD     "/"
+Mar 13 11:00:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:04 | 200 |   16.375027ms |       127.0.0.1 | POST     "/api/show"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.250-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9191292928 required="6.2 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.250-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.251-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33525"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] build info | build=0 commit="unknown" tid="140290989002752" timestamp=1741888804
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140290989002752" timestamp=1741888804 total_threads=16
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33525" tid="140290989002752" timestamp=1741888804
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.504-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:00:04 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:00:10 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:00:10 launchpad ollama[48847]: INFO [main] model loaded | tid="140290989002752" timestamp=1741888810
+Mar 13 11:00:10 launchpad ollama[1583]: time=2025-03-13T11:00:10.524-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 13 11:00:10 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:10 | 200 |  6.457434451s |       127.0.0.1 | POST     "/api/generate"
+Mar 13 11:01:45 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:01:45 | 200 |  8.931576314s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:05:43 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:05:43 | 200 |    7.4592717s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:07:19 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:07:19 | 200 |  7.940772107s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.000-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9195487232 required="6.2 GiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.000-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.001-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36807"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] build info | build=0 commit="unknown" tid="139961012674560" timestamp=1741889814
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139961012674560" timestamp=1741889814 total_threads=16
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36807" tid="139961012674560" timestamp=1741889814
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.253-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:16:54 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:16:55 launchpad ollama[51210]: INFO [main] model loaded | tid="139961012674560" timestamp=1741889815
+Mar 13 11:16:55 launchpad ollama[1583]: time=2025-03-13T11:16:55.258-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 13 11:17:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:17:04 | 200 | 10.685317637s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9195487232 required="6.2 GiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34067"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.091-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] build info | build=0 commit="unknown" tid="140571679039488" timestamp=1741890272
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140571679039488" timestamp=1741890272 total_threads=16
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34067" tid="140571679039488" timestamp=1741890272
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.341-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:24:32 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:24:33 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:24:33 launchpad ollama[52283]: INFO [main] model loaded | tid="140571679039488" timestamp=1741890273
+Mar 13 11:24:33 launchpad ollama[1583]: time=2025-03-13T11:24:33.094-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 13 11:24:42 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:24:42 | 200 | 10.199471089s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:28:28 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:28:28 | 200 |   5.64483084s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:29:05 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 11:29:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 11:29:06 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 11:29:06 launchpad systemd[1]: ollama.service: Consumed 58.161s CPU time, 5.5G memory peak, 4.6G read from disk, 508.1M written to disk, 932.9K incoming IP traffic, 1.1M outgoing IP traffic.
+-- Boot c1f31604bee640999d055f7bd130d882 --
+Mar 13 11:31:06 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 11:31:06 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 11:31:06 launchpad ollama[1580]: 2025/03/13 11:31:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.372-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.377-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.378-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.381-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1311511704/runners
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.620-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 11:35:54 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 11:35:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 11:35:54 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 11:35:54 launchpad systemd[1]: ollama.service: Consumed 3.420s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 3acb00d439a341faad90f7e8bfd12807 --
+Mar 13 11:36:26 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 11:36:26 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 11:36:26 launchpad ollama[1579]: 2025/03/13 11:36:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.740-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.746-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.747-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.748-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1006856464/runners
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.757-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.996-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 12:06:05 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:06:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:06:05 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:06:05 launchpad systemd[1]: ollama.service: Consumed 3.474s CPU time, 787.1M memory peak, 233.6M read from disk, 508.1M written to disk.
+-- Boot aee0d4f29c7e48a8833a7dec2be986b1 --
+Mar 13 12:06:37 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:06:37 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:06:38 launchpad ollama[1534]: 2025/03/13 12:06:38 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.073-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.078-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.079-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.080-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2809634376/runners
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.062-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.063-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.063-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.064-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.064-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.279-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:07:34 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:07:34 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:07:34 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:07:34 launchpad systemd[1]: ollama.service: Consumed 3.412s CPU time, 786.5M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot baf5c96d122a45f295e18e1a02e3d587 --
+Mar 13 12:08:12 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:08:12 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:08:12 launchpad ollama[1534]: 2025/03/13 12:08:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.695-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.701-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.702-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.703-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4269305187/runners
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.601-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.602-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.602-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.603-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.603-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:17 launchpad ollama[1534]: time=2025-03-13T12:08:17.355-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 13 12:08:34 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:08:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:08:35 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:08:35 launchpad systemd[1]: ollama.service: Consumed 5.225s CPU time, 786.9M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 157366d5108c4d06a7edb4c7cbaab0cf --
+Mar 13 12:09:05 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:09:05 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:09:05 launchpad ollama[1530]: 2025/03/13 12:09:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.839-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.844-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.845-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.847-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2657318819/runners
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.813-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.813-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:09 launchpad ollama[1530]: time=2025-03-13T12:09:09.034-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:11:19 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:11:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:11:19 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:11:19 launchpad systemd[1]: ollama.service: Consumed 3.429s CPU time, 786.6M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot 5dbee777929942a9be95deb1417c3c46 --
+Mar 13 12:12:23 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:12:24 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:12:24 launchpad ollama[1530]: 2025/03/13 12:12:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.109-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.114-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.115-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.117-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2006416511/runners
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.294-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:24:12 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:24:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:24:12 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:24:12 launchpad systemd[1]: ollama.service: Consumed 3.419s CPU time, 786.5M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 9892e1de7a774876a48ebed024abc84d --
+Mar 13 12:24:44 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:24:45 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:24:45 launchpad ollama[1527]: 2025/03/13 12:24:45 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.131-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.136-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.137-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.138-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4001498712/runners
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.129-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.129-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.327-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 14:30:09 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 14:30:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 14:30:09 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 14:30:09 launchpad systemd[1]: ollama.service: Consumed 3.514s CPU time, 786.7M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot 39d8f516509c4d6494bf51d29f68080d --
+Mar 13 14:30:51 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 14:30:51 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 14:30:51 launchpad ollama[1541]: 2025/03/13 14:30:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 14:30:51 launchpad ollama[1541]: time=2025-03-13T14:30:51.995-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.000-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.001-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.002-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1028367990/runners
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.842-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.843-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.843-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.844-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.844-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:56 launchpad ollama[1541]: time=2025-03-13T14:30:56.553-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 13 14:31:29 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 14:31:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 14:31:29 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 14:31:29 launchpad systemd[1]: ollama.service: Consumed 5.153s CPU time, 786.9M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot 071f7081c98f4fdcaa6c2adb0b749244 --
+Mar 13 14:32:00 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 14:32:00 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 14:32:00 launchpad ollama[1553]: 2025/03/13 14:32:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.916-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.921-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.922-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.924-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama352866884/runners
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.855-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:04 launchpad ollama[1553]: time=2025-03-13T14:32:04.065-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 16:17:43 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:43 | 200 |     557.539µs |       127.0.0.1 | HEAD     "/"
+Mar 13 16:17:43 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:43 | 200 |    6.958071ms |       127.0.0.1 | POST     "/api/show"
+Mar 13 16:17:43 launchpad ollama[1553]: time=2025-03-13T16:17:43.906-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10023403520 required="9.2 GiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 41581"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.060-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] build info | build=0 commit="unknown" tid="140323271462912" timestamp=1741907864
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140323271462912" timestamp=1741907864 total_threads=16
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41581" tid="140323271462912" timestamp=1741907864
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 16:17:44 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.311-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 16:17:52 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 16:17:52 launchpad ollama[23651]: INFO [main] model loaded | tid="140323271462912" timestamp=1741907872
+Mar 13 16:17:53 launchpad ollama[1553]: time=2025-03-13T16:17:53.085-07:00 level=INFO source=server.go:626 msg="llama runner started in 9.03 seconds"
+Mar 13 16:17:53 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:53 | 200 |  9.182304682s |       127.0.0.1 | POST     "/api/generate"
+Mar 13 16:19:48 launchpad ollama[1553]: time=2025-03-13T16:19:48.407-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 16:19:58 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:19:58 | 200 |  9.720090366s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:20:12 launchpad ollama[1553]: time=2025-03-13T17:20:12.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.038-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9956425728 required="9.2 GiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.038-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.039-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 42287"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] build info | build=0 commit="unknown" tid="139751712792576" timestamp=1741911613
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139751712792576" timestamp=1741911613 total_threads=16
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42287" tid="139751712792576" timestamp=1741911613
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 17:20:13 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.331-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 17:20:14 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 17:20:14 launchpad ollama[34476]: INFO [main] model loaded | tid="139751712792576" timestamp=1741911614
+Mar 13 17:20:14 launchpad ollama[1553]: time=2025-03-13T17:20:14.334-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 13 17:20:29 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:20:29 | 200 | 16.673622598s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:21:51 launchpad ollama[1553]: time=2025-03-13T17:21:51.661-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:21:57 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:21:57 | 200 |  5.496144275s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.221-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9958457344 required="9.2 GiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38611"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.378-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] build info | build=0 commit="unknown" tid="139831091875840" timestamp=1741912670
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139831091875840" timestamp=1741912670 total_threads=16
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38611" tid="139831091875840" timestamp=1741912670
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 17:37:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.667-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 17:37:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 17:37:51 launchpad ollama[38232]: INFO [main] model loaded | tid="139831091875840" timestamp=1741912671
+Mar 13 17:37:51 launchpad ollama[1553]: time=2025-03-13T17:37:51.671-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 13 17:38:11 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:38:11 | 200 |  20.86947765s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:39:02 launchpad ollama[1553]: time=2025-03-13T17:39:02.693-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:39:13 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:39:13 | 200 | 11.067425774s |       127.0.0.1 | POST     "/api/chat"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.689-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.851-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.852-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40861"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] build info | build=0 commit="unknown" tid="139884910071808" timestamp=1741973583
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139884910071808" timestamp=1741973583 total_threads=16
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40861" tid="139884910071808" timestamp=1741973583
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 14 10:33:03 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: time=2025-03-14T10:33:04.137-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors: offloaded 40/41 layers to GPU
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 14 10:33:04 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 4
+Mar 14 10:33:04 launchpad ollama[81310]: INFO [main] model loaded | tid="139884910071808" timestamp=1741973584
+Mar 14 10:33:05 launchpad ollama[1553]: time=2025-03-14T10:33:05.141-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 14 10:33:19 launchpad ollama[1553]: [GIN] 2025/03/14 - 10:33:19 | 200 | 15.442020204s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:11:55 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:55 | 200 |      14.236µs |       127.0.0.1 | HEAD     "/"
+Mar 15 14:11:55 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:55 | 200 |    3.460728ms |       127.0.0.1 | POST     "/api/show"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.543-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.704-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.5 GiB" free_swap="68.9 GiB"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.704-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 37509"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] build info | build=0 commit="unknown" tid="140427337490432" timestamp=1742073115
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140427337490432" timestamp=1742073115 total_threads=16
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37509" tid="140427337490432" timestamp=1742073115
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 14:11:55 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.992-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 14:11:56 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 14:11:56 launchpad ollama[197540]: INFO [main] model loaded | tid="140427337490432" timestamp=1742073116
+Mar 15 14:11:56 launchpad ollama[1553]: time=2025-03-15T14:11:56.996-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 14:11:56 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:56 | 200 |  1.455882801s |       127.0.0.1 | POST     "/api/generate"
+Mar 15 14:15:38 launchpad ollama[1553]: time=2025-03-15T14:15:38.730-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:15:57 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:15:57 | 200 | 19.213770804s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:16:25 launchpad ollama[1553]: time=2025-03-15T14:16:25.081-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:16:47 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:16:47 | 200 | 22.249997596s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.422-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.580-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.3 GiB" free_swap="68.9 GiB"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.581-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44303"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] build info | build=0 commit="unknown" tid="140708567494656" timestamp=1742075803
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140708567494656" timestamp=1742075803 total_threads=16
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44303" tid="140708567494656" timestamp=1742075803
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 14:56:43 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.870-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 14:56:44 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 14:56:44 launchpad ollama[235944]: INFO [main] model loaded | tid="140708567494656" timestamp=1742075804
+Mar 15 14:56:44 launchpad ollama[1553]: time=2025-03-15T14:56:44.874-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 14:56:57 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:56:57 | 200 | 14.438564166s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.516-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.673-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.3 GiB" free_swap="68.9 GiB"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.674-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42201"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] build info | build=0 commit="unknown" tid="139798432841728" timestamp=1742076529
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139798432841728" timestamp=1742076529 total_threads=16
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42201" tid="139798432841728" timestamp=1742076529
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 15:08:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.960-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 15:08:50 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 15:08:50 launchpad ollama[261636]: INFO [main] model loaded | tid="139798432841728" timestamp=1742076530
+Mar 15 15:08:50 launchpad ollama[1553]: time=2025-03-15T15:08:50.964-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 15:09:12 launchpad ollama[1553]: [GIN] 2025/03/15 - 15:09:12 | 200 | 22.680878508s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.593-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.750-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.8 GiB" free_swap="68.9 GiB"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.750-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 38893"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] build info | build=0 commit="unknown" tid="140515044581376" timestamp=1742087672
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140515044581376" timestamp=1742087672 total_threads=16
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38893" tid="140515044581376" timestamp=1742087672
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 18:14:32 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: time=2025-03-15T18:14:33.053-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 18:14:33 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 18:14:33 launchpad ollama[976378]: INFO [main] model loaded | tid="140515044581376" timestamp=1742087673
+Mar 15 18:14:34 launchpad ollama[1553]: time=2025-03-15T18:14:34.056-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Mar 15 18:14:48 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:14:48 | 200 | 15.625807839s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:16:42 launchpad ollama[1553]: time=2025-03-15T18:16:42.938-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:16:51 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:16:51 | 200 |  8.085941694s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:19:54 launchpad ollama[1553]: time=2025-03-15T18:19:54.708-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:20:11 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:20:11 | 200 | 16.835525025s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.266-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.417-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.8 GiB" free_swap="68.9 GiB"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.417-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 41137"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] build info | build=0 commit="unknown" tid="140524964081664" timestamp=1742089079
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140524964081664" timestamp=1742089079 total_threads=16
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41137" tid="140524964081664" timestamp=1742089079
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 18:37:59 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.711-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 18:38:00 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 18:38:00 launchpad ollama[988720]: INFO [main] model loaded | tid="140524964081664" timestamp=1742089080
+Mar 15 18:38:00 launchpad ollama[1553]: time=2025-03-15T18:38:00.715-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Mar 15 18:38:19 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:38:19 | 200 | 20.309878338s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:01:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:08 | 200 |       21.15µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:01:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:08 | 200 |   17.330488ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9252044800 required="6.2 GiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46863"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] build info | build=0 commit="unknown" tid="140604567769088" timestamp=1742418068
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604567769088" timestamp=1742418068 total_threads=16
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46863" tid="140604567769088" timestamp=1742418068
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:01:09 launchpad ollama[1553]: time=2025-03-19T14:01:09.023-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:01:09 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:01:14 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:01:14 launchpad ollama[1061925]: INFO [main] model loaded | tid="140604567769088" timestamp=1742418074
+Mar 19 14:01:14 launchpad ollama[1553]: time=2025-03-19T14:01:14.787-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Mar 19 14:01:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:14 | 200 |  6.210392007s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9287958528 required="6.2 GiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35981"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] build info | build=0 commit="unknown" tid="140374145241088" timestamp=1742418478
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140374145241088" timestamp=1742418478 total_threads=16
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35981" tid="140374145241088" timestamp=1742418478
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.262-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:07:58 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:07:59 launchpad ollama[1062918]: INFO [main] model loaded | tid="140374145241088" timestamp=1742418479
+Mar 19 14:07:59 launchpad ollama[1553]: time=2025-03-19T14:07:59.014-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:08:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:08:06 | 200 |  8.475536837s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:10:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:10:11 | 200 |  6.938443807s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.104-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9321512960 required="6.2 GiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.105-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.105-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34483"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] build info | build=0 commit="unknown" tid="140055665532928" timestamp=1742418934
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140055665532928" timestamp=1742418934 total_threads=16
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34483" tid="140055665532928" timestamp=1742418934
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.357-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:15:34 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:15:35 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:15:35 launchpad ollama[1064012]: INFO [main] model loaded | tid="140055665532928" timestamp=1742418935
+Mar 19 14:15:35 launchpad ollama[1553]: time=2025-03-19T14:15:35.110-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:15:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:15:44 | 200 | 10.615297126s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:19:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:19:16 | 200 |  6.355186351s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9318891520 required="6.2 GiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42893"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] build info | build=0 commit="unknown" tid="140046024835072" timestamp=1742419501
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140046024835072" timestamp=1742419501 total_threads=16
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42893" tid="140046024835072" timestamp=1742419501
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:25:01 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:25:02 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:25:02 launchpad ollama[1065345]: INFO [main] model loaded | tid="140046024835072" timestamp=1742419502
+Mar 19 14:25:02 launchpad ollama[1553]: time=2025-03-19T14:25:02.361-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 14:25:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:25:12 | 200 | 11.793706316s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:28:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:28:11 | 200 |  2.301137665s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:29:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:29:28 | 200 |  1.269617761s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:30:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:30:37 | 200 |  1.314152996s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:33:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:33:21 | 200 |  1.357600299s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:35:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:35:14 | 200 |  5.456157523s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:37:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:37:34 | 200 |  4.108627358s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:46:10 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:10 | 200 |      20.479µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:46:10 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:10 | 200 |    1.593569ms |       127.0.0.1 | GET      "/api/tags"
+Mar 19 14:46:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:24 | 200 |      15.723µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:46:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:24 | 200 |   12.732467ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:48:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:48:37 | 200 |      15.975µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:48:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:48:37 | 200 |   12.547593ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:44 | 200 |      15.458µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:44 | 200 |   12.842438ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:49 | 200 |      15.453µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:56:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:49 | 200 |   12.983892ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.143-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9286909952 required="6.2 GiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.143-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.8 GiB" free_swap="68.9 GiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.144-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40963"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] build info | build=0 commit="unknown" tid="139701962080256" timestamp=1742421410
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139701962080256" timestamp=1742421410 total_threads=16
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40963" tid="139701962080256" timestamp=1742421410
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:56:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:56:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:56:51 launchpad ollama[1070359]: INFO [main] model loaded | tid="139701962080256" timestamp=1742421411
+Mar 19 14:56:51 launchpad ollama[1553]: time=2025-03-19T14:56:51.149-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:56:51 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:51 | 200 |  1.192376819s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 14:57:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:57:46 | 200 |  7.790673944s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:58:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:58:02 | 200 |  541.928223ms |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:58:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:58:41 | 200 |  750.543335ms |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:00:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:00:08 | 200 |      24.017µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:00:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:00:08 | 200 |      55.217µs |       127.0.0.1 | GET      "/api/ps"
+Mar 19 15:05:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:05:52 | 200 |      16.498µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:05:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:05:52 | 200 |   12.533954ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 15:09:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:09:53 | 200 |      15.373µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:05 | 200 |       20.55µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:07 | 200 |      48.615µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 15:10:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:07 | 200 |   29.193307ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 15:10:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:14 | 200 |      14.646µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:14 | 200 |     766.332µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 15:10:35 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:35 | 200 |      22.106µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:35 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:35 | 200 |   12.866448ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.909-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9301458944 required="7.7 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.909-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.910-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35497"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] build info | build=0 commit="unknown" tid="139727709519872" timestamp=1742422235
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139727709519872" timestamp=1742422235 total_threads=16
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35497" tid="139727709519872" timestamp=1742422235
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:10:36 launchpad ollama[1553]: time=2025-03-19T15:10:36.163-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:10:36 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:10:36 launchpad ollama[1073468]: INFO [main] model loaded | tid="139727709519872" timestamp=1742422236
+Mar 19 15:10:36 launchpad ollama[1553]: time=2025-03-19T15:10:36.920-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 15:10:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:36 | 200 |  1.198454452s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 15:10:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:49 | 200 |  6.835704028s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9278390272 required="7.7 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38595"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] build info | build=0 commit="unknown" tid="140121490862080" timestamp=1742422569
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140121490862080" timestamp=1742422569 total_threads=16
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38595" tid="140121490862080" timestamp=1742422569
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.776-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:16:09 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:16:10 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:16:10 launchpad ollama[1074353]: INFO [main] model loaded | tid="140121490862080" timestamp=1742422570
+Mar 19 15:16:10 launchpad ollama[1553]: time=2025-03-19T15:16:10.535-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 15:16:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:16:20 | 200 | 11.446151344s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9278390272 required="7.7 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37839"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] build info | build=0 commit="unknown" tid="139992453177344" timestamp=1742423041
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139992453177344" timestamp=1742423041 total_threads=16
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37839" tid="139992453177344" timestamp=1742423041
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.423-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:24:01 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:24:02 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:24:02 launchpad ollama[1075479]: INFO [main] model loaded | tid="139992453177344" timestamp=1742423042
+Mar 19 15:24:02 launchpad ollama[1553]: time=2025-03-19T15:24:02.427-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 15:24:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:24:12 | 200 | 11.890800678s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281011712 required="7.7 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46409"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.191-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] build info | build=0 commit="unknown" tid="140440360804352" timestamp=1742424650
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140440360804352" timestamp=1742424650 total_threads=16
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46409" tid="140440360804352" timestamp=1742424650
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:50:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:50:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:50:51 launchpad ollama[1079902]: INFO [main] model loaded | tid="140440360804352" timestamp=1742424651
+Mar 19 15:50:51 launchpad ollama[1553]: time=2025-03-19T15:50:51.445-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 15:50:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:50:59 | 200 |  9.199214165s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:52:45 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:52:45 | 200 |  8.535873447s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:53:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:53:36 | 200 | 10.706122932s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:55:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:55:18 | 200 |  8.647136848s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:58:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:58:15 | 200 |  8.602667884s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:00:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:00:16 | 200 |  8.253027637s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:04:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:04:16 | 200 | 11.007915608s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.592-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281536000 required="7.7 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.592-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.593-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39179"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] build info | build=0 commit="unknown" tid="140212455632896" timestamp=1742425959
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140212455632896" timestamp=1742425959 total_threads=16
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39179" tid="140212455632896" timestamp=1742425959
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.845-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:12:39 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:12:40 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:12:40 launchpad ollama[1083210]: INFO [main] model loaded | tid="140212455632896" timestamp=1742425960
+Mar 19 16:12:40 launchpad ollama[1553]: time=2025-03-19T16:12:40.848-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:12:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:12:59 | 200 | 19.734470957s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:14:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:14:34 | 200 |  12.53505037s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.174-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281536000 required="7.7 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.174-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.175-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.175-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38891"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] build info | build=0 commit="unknown" tid="140123967619072" timestamp=1742426672
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140123967619072" timestamp=1742426672 total_threads=16
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38891" tid="140123967619072" timestamp=1742426672
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.427-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:24:32 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:24:33 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:24:33 launchpad ollama[1084909]: INFO [main] model loaded | tid="140123967619072" timestamp=1742426673
+Mar 19 16:24:33 launchpad ollama[1553]: time=2025-03-19T16:24:33.431-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:24:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:24:52 | 200 | 20.794159326s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:29:23 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:29:23 | 200 |  8.824535282s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:30:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:30:46 | 200 |  6.784859875s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:37:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:37:12 | 200 |         4m41s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41117"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.149-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] build info | build=0 commit="unknown" tid="140624371212288" timestamp=1742427925
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624371212288" timestamp=1742427925 total_threads=16
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41117" tid="140624371212288" timestamp=1742427925
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.399-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:45:25 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:45:26 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:45:26 launchpad ollama[1087999]: INFO [main] model loaded | tid="140624371212288" timestamp=1742427926
+Mar 19 16:45:26 launchpad ollama[1553]: time=2025-03-19T16:45:26.403-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:46:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:46:18 | 200 | 53.479928364s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:47:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:47:30 | 200 |   12.893698ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:50:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:50:22 | 200 |   12.498092ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:50:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:50:48 | 200 |   12.886171ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:55:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:32 | 200 |      15.615µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:32 | 200 |     453.855µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 16:55:55 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:55 | 200 |      50.492µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:55 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:55 | 200 |     1.44822ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 16:55:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:58 | 200 |      31.101µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:58 | 200 |     468.123µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 16:56:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:18 | 200 |      15.452µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:21 | 200 |      67.654µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 16:56:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:21 | 200 |   26.437434ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 16:56:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:33 | 200 |      15.129µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:33 | 200 |    1.261061ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 16:56:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:37 | 200 |      15.369µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:40 | 200 |      91.099µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 16:56:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:40 | 200 |   28.411029ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 16:56:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:48 | 200 |      19.801µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:48 | 200 |   12.680799ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.870-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.870-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.871-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43755"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] build info | build=0 commit="unknown" tid="140007767351296" timestamp=1742428608
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140007767351296" timestamp=1742428608 total_threads=16
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43755" tid="140007767351296" timestamp=1742428608
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:56:49 launchpad ollama[1553]: time=2025-03-19T16:56:49.123-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:56:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:56:49 launchpad ollama[1090073]: INFO [main] model loaded | tid="140007767351296" timestamp=1742428609
+Mar 19 16:56:50 launchpad ollama[1553]: time=2025-03-19T16:56:50.126-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:56:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:50 | 200 |  1.436506149s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 16:59:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:59:16 | 200 |  3.779718793s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:59:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:59:40 | 200 |   3.14516953s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:00:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:50 | 200 |      14.881µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:00:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:50 | 200 |    1.513938ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 17:00:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:53 | 200 |      16.591µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:00:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:56 | 200 |      47.845µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 17:00:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:56 | 200 |   26.827483ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |      16.344µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |    13.00695ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |    13.15404ms |       127.0.0.1 | POST     "/api/generate"
+Mar 19 17:01:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:05 | 200 |  1.045303354s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:01:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:24 | 200 |  1.878753637s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.820-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.821-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.821-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35723"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] build info | build=0 commit="unknown" tid="139957758668800" timestamp=1742429354
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139957758668800" timestamp=1742429354 total_threads=16
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35723" tid="139957758668800" timestamp=1742429354
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 17:09:15 launchpad ollama[1553]: time=2025-03-19T17:09:15.073-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 17:09:15 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 17:09:15 launchpad ollama[1092033]: INFO [main] model loaded | tid="139957758668800" timestamp=1742429355
+Mar 19 17:09:15 launchpad ollama[1553]: time=2025-03-19T17:09:15.830-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 17:09:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:09:26 | 200 | 11.549671136s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.5 GiB" free_swap="68.9 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 42231"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] build info | build=0 commit="unknown" tid="140655190937600" timestamp=1742429830
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140655190937600" timestamp=1742429830 total_threads=16
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42231" tid="140655190937600" timestamp=1742429830
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 17:17:10 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 17:17:11 launchpad ollama[1093171]: INFO [main] model loaded | tid="140655190937600" timestamp=1742429831
+Mar 19 17:17:11 launchpad ollama[1553]: time=2025-03-19T17:17:11.283-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 17:17:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:17:20 | 200 | 11.089850045s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:18:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:18:34 | 200 |  8.608081362s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:22:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:22:28 | 200 |  8.072222972s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:24:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:24:34 | 200 |  1.238467883s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:25:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:25:52 | 200 |  1.353795697s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:26:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:07 | 200 |   12.478808ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:26:17 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:17 | 200 |  1.430607912s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:26:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:47 | 200 |   12.951997ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:27:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:27:09 | 200 |   1.79384077s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:27:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:27:49 | 200 |   12.373791ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:28:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:28:02 | 200 |  8.495199451s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:32:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:32:40 | 200 |  2.553248593s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:32:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:32:52 | 200 |  7.599296148s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:33:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:33:53 | 200 |  2.814389526s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:34:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:34:06 | 200 |  9.097858226s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:34:54 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:34:54 | 200 |  5.215343626s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:35:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:35:07 | 200 |  3.618264464s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:35:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:35:21 | 200 |  3.527019179s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:12 | 200 |  7.495950982s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:18 | 200 |  4.124787454s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:30 | 200 |  5.509177899s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:37:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:37:05 | 200 |  5.768666334s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:37:19 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:37:19 | 200 |   6.10250155s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:38:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:38:01 | 200 | 11.900695987s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:38:19 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:38:19 | 200 | 12.448423864s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:39:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:06 | 200 |   13.272028ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:39:25 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:25 | 200 | 16.280344686s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:39:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:56 | 200 | 16.733725425s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9279242240 required="7.7 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.5 GiB" free_swap="68.9 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.244-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41107"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] build info | build=0 commit="unknown" tid="140290009657344" timestamp=1742433310
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140290009657344" timestamp=1742433310 total_threads=16
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41107" tid="140290009657344" timestamp=1742433310
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.495-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 18:15:10 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 18:15:11 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 18:15:11 launchpad ollama[1101363]: INFO [main] model loaded | tid="140290009657344" timestamp=1742433311
+Mar 19 18:15:11 launchpad ollama[1553]: time=2025-03-19T18:15:11.499-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 18:15:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:15:27 | 200 | 17.277527942s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:31:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:31:16 | 200 |        15m13s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:32:23 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:32:23 | 200 |  5.721499854s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:32:51 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:32:51 | 200 |  6.442060577s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:33:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:33:53 | 200 |  3.703942131s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:34:31 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:34:31 | 200 |  2.968032978s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:36:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:36:24 | 200 |   2.75354646s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:36:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:36:53 | 200 |  4.395223699s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:37:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:37:11 | 200 |  7.272151729s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:37:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:37:53 | 200 |  6.228299736s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9264496640 required="7.7 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.4 GiB" free_swap="68.9 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37565"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] build info | build=0 commit="unknown" tid="139917702578176" timestamp=1742435209
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139917702578176" timestamp=1742435209 total_threads=16
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37565" tid="139917702578176" timestamp=1742435209
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.305-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 18:46:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 18:46:50 launchpad ollama[1106074]: INFO [main] model loaded | tid="139917702578176" timestamp=1742435210
+Mar 19 18:46:50 launchpad ollama[1553]: time=2025-03-19T18:46:50.061-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 18:47:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:47:00 | 200 | 11.785600436s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:49:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:49:46 | 200 |  8.865631358s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:51:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:51:30 | 200 | 11.177185966s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:52:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:52:44 | 200 |  8.542652753s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:56:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:56:32 | 200 |  10.46354348s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:58:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:58:00 | 200 | 12.471833057s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:59:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:59:36 | 200 |  2.481295797s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:00:03 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:00:03 | 200 |  8.969396025s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:00:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:00:41 | 200 |  9.171149938s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:01:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:01:27 | 200 |  7.960006373s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:01:54 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:01:54 | 200 | 11.638280693s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:02:13 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:02:13 | 200 |  9.969332277s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:02:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:02:33 | 200 | 14.522601219s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:05:25 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:05:25 | 200 |         1m45s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:05:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:05:50 | 200 | 10.538393198s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:07:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:07:06 | 200 |  5.947861596s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:07:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:07:36 | 200 | 18.144623386s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:08:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:08:34 | 200 |   13.045318ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:08:43 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:08:43 | 200 |  4.751733702s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:09:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:09:18 | 200 |  4.804210665s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:10:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:10:22 | 200 |  5.464388284s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:11:39 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:11:39 | 200 |  4.175226817s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:13:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:13:15 | 200 |  3.617657116s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:13:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:13:58 | 200 |   12.832254ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:14 | 200 |   12.346782ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:28 | 200 |   13.036026ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:37 | 200 |   12.309953ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:53 | 200 |   12.602511ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:15:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:15:46 | 200 |  4.035332668s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:16:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:16:53 | 200 |  5.417289161s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:18:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:18:30 | 200 |  2.049013566s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:19:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:19:50 | 200 |  1.872510983s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:20:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:20:44 | 200 |   12.679108ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:21:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:21:59 | 200 |  2.108290057s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:22:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:22:01 | 200 |  2.060780391s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:26:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:27 | 200 |      16.242µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:26:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:30 | 200 |      46.924µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:26:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:30 | 200 |   23.596124ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |      26.615µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |   12.355854ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |   12.726734ms |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:26:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:47 | 200 |  3.857925263s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:27:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:27:33 | 200 |   5.58264154s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:29:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:29:02 | 200 |   8.70896357s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:30:17 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:30:17 | 200 |  5.825537391s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:31:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:31:09 | 200 |  5.019266652s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:32:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:32:18 | 200 |  6.983706554s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:33:38 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:33:38 | 200 |  6.042807373s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:35:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:35:26 | 200 |  7.537894802s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:36:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:36:37 | 200 |      16.182µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:36:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:36:37 | 200 |    1.450239ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 19:42:38 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:38 | 200 |      15.143µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:42:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:41 | 200 |      55.022µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:42:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:41 | 200 |   24.173529ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 200 |      15.006µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 404 |      60.392µs |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 400 |     344.364µs |       127.0.0.1 | POST     "/api/pull"
+Mar 19 19:43:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:20 | 200 |      16.487µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:43:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:20 | 200 |   12.909453ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9074900992 required="7.7 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.3 GiB" free_swap="68.9 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41931"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.065-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] build info | build=0 commit="unknown" tid="139693597380608" timestamp=1742438601
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139693597380608" timestamp=1742438601 total_threads=16
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41931" tid="139693597380608" timestamp=1742438601
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.315-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 19:43:21 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 19:43:22 launchpad ollama[1115036]: INFO [main] model loaded | tid="139693597380608" timestamp=1742438602
+Mar 19 19:43:22 launchpad ollama[1553]: time=2025-03-19T19:43:22.318-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 19:43:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:22 | 200 |  1.437816344s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:43:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:30 | 200 |  4.306321761s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:44:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:44:33 | 200 |  8.974803804s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:47:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:47:09 | 200 |  7.410393098s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:54:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:48 | 200 |      16.315µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:54:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:50 | 200 |      47.754µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:54:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:50 | 200 |   24.445941ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:54:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:59 | 200 |      15.258µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:54:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:59 | 200 |   12.857306ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9148235776 required="7.7 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.2 GiB" free_swap="68.9 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41817"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] build info | build=0 commit="unknown" tid="140643379798016" timestamp=1742439299
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140643379798016" timestamp=1742439299 total_threads=16
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41817" tid="140643379798016" timestamp=1742439299
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 19:55:00 launchpad ollama[1553]: time=2025-03-19T19:55:00.205-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 19:55:00 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 19:55:00 launchpad ollama[1117144]: INFO [main] model loaded | tid="140643379798016" timestamp=1742439300
+Mar 19 19:55:01 launchpad ollama[1553]: time=2025-03-19T19:55:01.208-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 19:55:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:01 | 200 |  1.442368701s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:55:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:08 | 200 |  3.249757371s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:55:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:37 | 200 |   12.501524ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:56:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:56:11 | 200 |  9.066361293s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:57:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:57:46 | 200 |  3.335807135s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:58:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:58:28 | 200 |  3.875927335s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:59:31 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:59:31 | 200 |  5.199894045s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:00:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:00:15 | 200 |  5.955558681s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:01:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:01:56 | 200 |  5.641119179s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:54:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:54:26 | 200 |       19.73µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:54:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:54:26 | 200 |     602.314µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 20:55:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:55:47 | 200 |      16.687µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:55:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:55:47 | 200 |    3.535167ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 20:56:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:00 | 200 |      15.086µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:56:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:00 | 200 |    3.192762ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 20:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:44 | 200 |      14.135µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:44 | 200 |      15.995µs |       127.0.0.1 | GET      "/api/ps"
+Mar 20 17:21:26 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 20 17:21:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 20 17:21:26 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 20 17:21:26 launchpad systemd[1]: ollama.service: Consumed 43min 48.241s CPU time, 12.6G memory peak, 251.3M memory swap peak, 11.5G read from disk, 759.6M written to disk.
+-- Boot bf778c65c1c7444eafed20125c5a887b --
+Mar 20 17:22:03 launchpad systemd[1]: Starting Server for local large language models...
+Mar 20 17:22:03 launchpad systemd[1]: Started Server for local large language models.
+Mar 20 17:22:03 launchpad ollama[1530]: 2025/03/20 17:22:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.293-07:00 level=INFO source=images.go:753 msg="total blobs: 26"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.305-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 1"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.307-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.309-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3643862997/runners
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.346-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.565-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 20 17:42:51 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:42:51 | 200 |    3.366137ms |       127.0.0.1 | GET      "/api/tags"
+Mar 20 17:43:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:43:36 | 200 |     481.038µs |       127.0.0.1 | GET      "/api/tags"
+Mar 20 17:43:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:43:36 | 200 |     514.798µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10108076032 required="7.7 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.573-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36721"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] build info | build=0 commit="unknown" tid="139779810324480" timestamp=1742517874
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139779810324480" timestamp=1742517874 total_threads=16
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36721" tid="139779810324480" timestamp=1742517874
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.825-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 20 17:44:34 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 16384
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 20 17:44:40 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 20 17:44:40 launchpad ollama[13493]: INFO [main] model loaded | tid="139779810324480" timestamp=1742517880
+Mar 20 17:44:40 launchpad ollama[1530]: time=2025-03-20T17:44:40.842-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 20 17:44:43 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:43 | 200 |  9.257646341s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:44:44 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:44 | 200 |  583.080887ms |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:44:45 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:45 | 200 |  1.466299486s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:45:57 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:45:57 | 200 |  5.815849459s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:47:14 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:47:14 | 200 |  6.980502359s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:49:18 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:49:18 | 200 |  5.583089859s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:50:19 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:50:19 | 200 |  7.638621575s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:54:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:54:36 | 200 |  7.323435618s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:57:30 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:57:30 | 200 |  8.047903525s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:00:19 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:00:19 | 200 |  6.289686262s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:02:11 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:02:11 | 200 |  8.495516305s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:05:02 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:05:02 | 200 |      40.555µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 18:36:03 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:36:03 | 200 |      41.118µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10102964224 required="7.7 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46093"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.642-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] build info | build=0 commit="unknown" tid="140706518519808" timestamp=1742520997
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140706518519808" timestamp=1742520997 total_threads=16
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46093" tid="140706518519808" timestamp=1742520997
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.893-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 20 18:36:37 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 16384
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 20 18:36:38 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 20 18:36:38 launchpad ollama[55558]: INFO [main] model loaded | tid="140706518519808" timestamp=1742520998
+Mar 20 18:36:38 launchpad ollama[1530]: time=2025-03-20T18:36:38.895-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 20 18:36:53 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:36:53 | 200 | 15.969996668s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:39:50 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:39:50 | 200 |  6.703338511s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:41:51 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:41:51 | 200 |  6.534414397s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:45:03 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:45:03 | 200 | 10.047744622s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:47:31 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:47:31 | 200 |  7.488868866s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:52:22 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:52:22 | 200 |   7.42625173s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 11:18:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 11:18:44 | 200 |      440.15µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 11:18:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 11:18:44 | 200 |      43.008µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 13:07:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 13:07:44 | 200 |     503.978µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 14:24:55 launchpad ollama[1530]: [GIN] 2025/03/21 - 14:24:55 | 200 |     453.444µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 14:36:18 launchpad ollama[1530]: [GIN] 2025/03/21 - 14:36:18 | 200 |      30.713µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:08:11 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:08:11 | 200 |      56.519µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.246-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=6543376384 required="5.1 GiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.246-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="40.5 GiB" free_swap="68.9 GiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.247-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[6.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="5.1 GiB" memory.required.partial="5.1 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[5.1 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="677.5 MiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36369"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] build info | build=0 commit="unknown" tid="140551136841728" timestamp=1742594944
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140551136841728" timestamp=1742594944 total_threads=16
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36369" tid="140551136841728" timestamp=1742594944
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.499-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 21 15:09:04 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 2048
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 21 15:09:05 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 21 15:09:05 launchpad ollama[538108]: INFO [main] model loaded | tid="140551136841728" timestamp=1742594945
+Mar 21 15:09:05 launchpad ollama[1530]: time=2025-03-21T15:09:05.503-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 21 15:09:13 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:13 | 200 |  8.996825431s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:09:13 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:13 | 200 |  594.785245ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:09:16 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:16 | 200 |  2.606560411s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:10:27 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:10:27 | 200 |      34.064µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:21:25 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:21:25 | 200 |      521.38µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 15:22:03 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:03 | 200 |     563.204µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=6561005568 required="5.1 GiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="40.5 GiB" free_swap="68.9 GiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[6.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="5.1 GiB" memory.required.partial="5.1 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[5.1 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="677.5 MiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37513"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] build info | build=0 commit="unknown" tid="140372910104576" timestamp=1742595739
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140372910104576" timestamp=1742595739 total_threads=16
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37513" tid="140372910104576" timestamp=1742595739
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.289-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 21 15:22:19 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 2048
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 21 15:22:20 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 21 15:22:20 launchpad ollama[540099]: INFO [main] model loaded | tid="140372910104576" timestamp=1742595740
+Mar 21 15:22:20 launchpad ollama[1530]: time=2025-03-21T15:22:20.292-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 21 15:22:29 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:29 | 200 | 10.384552038s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:22:29 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:29 | 200 |   673.66211ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:22:32 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:32 | 200 |  2.666591652s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:23 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:23 | 200 |  9.822015206s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:24 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:24 | 200 |  676.470508ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:27 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:27 | 200 |   3.13406576s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 17:02:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 17:02:44 | 200 |     555.604µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 17:02:45 launchpad ollama[1530]: [GIN] 2025/03/21 - 17:02:45 | 200 |      43.831µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 11:48:56 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 22 11:48:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 22 11:48:56 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 22 11:48:56 launchpad systemd[1]: ollama.service: Consumed 2min 32.909s CPU time, 5.4G memory peak, 48.9M memory swap peak, 4.6G read from disk, 557M written to disk, 487.4K incoming IP traffic, 677.3K outgoing IP traffic.
+-- Boot 87d76a808f644c6e8843c8f6c36724e1 --
+Mar 22 11:49:48 launchpad systemd[1]: Starting Server for local large language models...
+Mar 22 11:49:48 launchpad systemd[1]: Started Server for local large language models.
+Mar 22 11:49:48 launchpad ollama[1530]: 2025/03/22 11:49:48 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.898-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.907-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.910-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.911-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2384678814/runners
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.928-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.929-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.929-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.930-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.930-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:52 launchpad ollama[1530]: time=2025-03-22T11:49:52.164-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 22 12:05:09 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 22 12:05:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 22 12:05:09 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 22 12:05:09 launchpad systemd[1]: ollama.service: Consumed 3.420s CPU time, 786.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot d80e719555084707a7104971b1b682af --
+Mar 22 12:05:40 launchpad systemd[1]: Starting Server for local large language models...
+Mar 22 12:05:40 launchpad systemd[1]: Started Server for local large language models.
+Mar 22 12:05:40 launchpad ollama[1526]: 2025/03/22 12:05:40 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.978-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.985-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.987-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.988-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1402874388/runners
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.011-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.252-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 22 18:32:07 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:07 | 200 |    3.125817ms |       127.0.0.1 | GET      "/api/tags"
+Mar 22 18:32:08 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:08 | 200 |     479.492µs |       127.0.0.1 | GET      "/api/tags"
+Mar 22 18:32:08 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:08 | 200 |     361.952µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 18:32:17 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:17 | 200 |       24.56µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.995-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.2 GiB" free_swap="68.9 GiB"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.995-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 32777"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] build info | build=0 commit="unknown" tid="140560787623936" timestamp=1742693766
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140560787623936" timestamp=1742693766 total_threads=16
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32777" tid="140560787623936" timestamp=1742693766
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 22 18:36:06 launchpad ollama[1526]: time=2025-03-22T18:36:06.248-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 22 18:36:06 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 22 18:36:11 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 22 18:36:12 launchpad ollama[63432]: INFO [main] model loaded | tid="140560787623936" timestamp=1742693772
+Mar 22 18:36:12 launchpad ollama[1526]: time=2025-03-22T18:36:12.266-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 22 18:36:31 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:31 | 200 | 25.756314363s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:36:32 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:32 | 200 |  1.130742083s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:36:55 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:55 | 200 | 22.246422296s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.111-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.1 GiB" free_swap="68.9 GiB"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.111-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 42175"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] build info | build=0 commit="unknown" tid="139826444582912" timestamp=1742694179
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139826444582912" timestamp=1742694179 total_threads=16
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42175" tid="139826444582912" timestamp=1742694179
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.363-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 22 18:42:59 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 22 18:43:00 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 22 18:43:00 launchpad ollama[81038]: INFO [main] model loaded | tid="139826444582912" timestamp=1742694180
+Mar 22 18:43:00 launchpad ollama[1526]: time=2025-03-22T18:43:00.124-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 22 19:01:30 launchpad ollama[1526]: [GIN] 2025/03/22 - 19:01:30 | 200 |        18m32s |       127.0.0.1 | POST     "/api/chat"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.627-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.8 GiB" free_swap="68.9 GiB"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.628-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 45027"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] build info | build=0 commit="unknown" tid="139888281346048" timestamp=1742754368
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139888281346048" timestamp=1742754368 total_threads=16
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45027" tid="139888281346048" timestamp=1742754368
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.880-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 23 11:26:08 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 23 11:26:09 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 23 11:26:09 launchpad ollama[354272]: INFO [main] model loaded | tid="139888281346048" timestamp=1742754369
+Mar 23 11:26:09 launchpad ollama[1526]: time=2025-03-23T11:26:09.633-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 23 11:26:12 launchpad ollama[1526]: [GIN] 2025/03/23 - 11:26:12 | 200 |  4.108947061s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 11:47:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 29 11:47:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 29 11:47:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 29 11:47:25 launchpad systemd[1]: ollama.service: Consumed 1h 24min 53.790s CPU time, 5.5G memory peak, 4.5G read from disk, 508.1M written to disk, 4.8M incoming IP traffic, 7.2M outgoing IP traffic.
+-- Boot bf78be1ece36430ea50ae36db80fd028 --
+Mar 29 11:47:58 launchpad systemd[1]: Starting Server for local large language models...
+Mar 29 11:47:58 launchpad systemd[1]: Started Server for local large language models.
+Mar 29 11:47:58 launchpad ollama[1512]: 2025/03/29 11:47:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.329-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.339-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.341-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.344-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2352317378/runners
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.346-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.624-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.3 GiB"
+Mar 29 11:49:26 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 29 11:49:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 29 11:49:26 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 29 11:49:26 launchpad systemd[1]: ollama.service: Consumed 3.432s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot cf2f988c42ce41d9aeb1a1f4e09a64af --
+Mar 29 11:49:57 launchpad systemd[1]: Starting Server for local large language models...
+Mar 29 11:49:57 launchpad systemd[1]: Started Server for local large language models.
+Mar 29 11:49:58 launchpad ollama[1510]: 2025/03/29 11:49:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.011-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.020-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.022-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.024-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1630742596/runners
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.071-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.072-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.072-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.301-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 29 12:06:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:06:57 | 200 |     909.428µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:03 | 200 |      46.264µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:07:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:03 | 200 |   26.073261ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:07:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:11 | 200 |      15.595µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:11 | 200 |    3.796966ms |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:07:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:31 | 200 |      25.537µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:31 | 200 |    2.925078ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 29 12:07:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:40 | 200 |      21.429µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:42 | 200 |      56.883µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:07:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:42 | 200 |   26.024146ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:07:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:45 | 200 |      15.269µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:45 | 200 |     496.543µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:45 | 200 |     423.031µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:46 | 200 |     457.267µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:46 | 200 |      31.581µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:12:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:06 | 200 |      48.021µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.934-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10215030784 required="7.7 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.935-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.935-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40787"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.937-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] build info | build=0 commit="unknown" tid="139713714561024" timestamp=1743275538
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139713714561024" timestamp=1743275538 total_threads=16
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40787" tid="139713714561024" timestamp=1743275538
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:12:19 launchpad ollama[1510]: time=2025-03-29T12:12:19.187-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:12:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:12:20 launchpad ollama[8895]: INFO [main] model loaded | tid="139713714561024" timestamp=1743275540
+Mar 29 12:12:20 launchpad ollama[1510]: time=2025-03-29T12:12:20.191-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 12:12:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:23 | 200 |  4.897748354s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:12:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:24 | 200 |   637.32604ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:12:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:27 | 200 |   3.19952118s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:13:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:13:14 | 200 |  4.952539512s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:15:18 | 200 |  6.665898331s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:17:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:17:09 | 200 |  5.759665031s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:18:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:18:52 | 200 |  7.221153274s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:20:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:20:30 | 200 |  6.551018819s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:23:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:23:24 | 200 |  4.737337823s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:25:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:25:07 | 200 |  3.491567625s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:26:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:26:15 | 200 |  7.677028492s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:28:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:28:38 | 200 | 11.999277099s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:33:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:33:45 | 200 | 10.595503222s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:35:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:35:06 | 200 | 11.251535252s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10224271360 required="7.7 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46297"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] build info | build=0 commit="unknown" tid="140134671540224" timestamp=1743277333
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140134671540224" timestamp=1743277333 total_threads=16
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46297" tid="140134671540224" timestamp=1743277333
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:42:14 launchpad ollama[1510]: time=2025-03-29T12:42:14.066-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:42:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:42:14 launchpad ollama[13652]: INFO [main] model loaded | tid="140134671540224" timestamp=1743277334
+Mar 29 12:42:15 launchpad ollama[1510]: time=2025-03-29T12:42:15.069-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 12:42:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:42:26 | 200 | 12.993540226s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:43:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:43:11 | 200 |  8.164448502s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:43:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:43:58 | 200 |      15.023µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:01 | 200 |      45.372µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:44:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:01 | 200 |   24.951458ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:44:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:14 | 200 |      16.158µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:14 | 200 |      67.418µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:44:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:36 | 200 |      17.009µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:36 | 200 |      431.12µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:45:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:12 | 200 |       15.25µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:12 | 200 |     350.332µs |       127.0.0.1 | POST     "/api/generate"
+Mar 29 12:45:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:16 | 200 |      15.152µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:16 | 200 |       7.923µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.092-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10264117248 required="7.7 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.092-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.093-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.093-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43959"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] build info | build=0 commit="unknown" tid="140403551617024" timestamp=1743277527
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140403551617024" timestamp=1743277527 total_threads=16
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43959" tid="140403551617024" timestamp=1743277527
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.345-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:45:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:45:28 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:45:28 launchpad ollama[14407]: INFO [main] model loaded | tid="140403551617024" timestamp=1743277528
+Mar 29 12:45:28 launchpad ollama[1510]: time=2025-03-29T12:45:28.349-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 12:45:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:31 | 200 |      17.212µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:31 | 200 |      18.745µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:45:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:40 | 200 | 13.878557309s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:46:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:46:17 | 200 |  4.925003725s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:47:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:47:30 | 200 |      19.517µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:47:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:47:30 | 200 |   12.998916ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 12:48:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:48:23 | 200 |      25.343µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:48:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:48:27 | 200 |      25.454µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:49:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:49:15 | 200 |  5.394766361s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:49:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:49:18 | 200 |  7.810179363s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:50:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:50:33 | 200 |  7.107549148s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:51:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:41 | 200 |      14.957µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:51:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:43 | 200 |      49.621µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:51:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:43 | 200 |   25.094074ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:51:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:46 | 200 |      15.517µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:51:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:46 | 200 |   13.371083ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 12:52:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:52:14 | 200 | 11.649019777s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:53:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:53:18 | 200 |  5.781659817s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:54:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:54:26 | 200 |  5.181279914s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:55:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:55:11 | 200 |  5.992279574s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:56:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:56:43 | 200 |  9.127719573s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:57:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:57:52 | 200 | 12.860902242s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:59:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:59:03 | 200 |  8.892812668s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:00:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:00:23 | 200 | 10.323785591s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10266214400 required="7.7 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40401"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] build info | build=0 commit="unknown" tid="140078037950464" timestamp=1743278740
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140078037950464" timestamp=1743278740 total_threads=16
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40401" tid="140078037950464" timestamp=1743278740
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: time=2025-03-29T13:05:40.242-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:05:40 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:05:41 launchpad ollama[17589]: INFO [main] model loaded | tid="140078037950464" timestamp=1743278741
+Mar 29 13:05:41 launchpad ollama[1510]: time=2025-03-29T13:05:41.247-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:05:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:05:59 | 200 | 19.995767094s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:07:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:07:26 | 200 |  8.150352463s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:08:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:08:03 | 200 | 10.703657875s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:09:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:09:36 | 200 |  9.706439772s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:10:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:10:55 | 200 |  4.229398164s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:11:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:11:50 | 200 |  3.435489866s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:03 | 200 |  3.636389468s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:22 | 200 |  4.237886728s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:34 | 200 |  3.627417048s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:49 | 200 |  3.926222454s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:13:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:13:03 | 200 |  4.159111102s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:14:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:20 | 200 |  3.896558089s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:14:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:44 | 200 |      15.207µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:14:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:44 | 200 |     265.723µs |       127.0.0.1 | POST     "/api/generate"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.252-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10247340032 required="7.7 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.252-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.253-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36405"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] build info | build=0 commit="unknown" tid="140561416552448" timestamp=1743279288
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140561416552448" timestamp=1743279288 total_threads=16
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36405" tid="140561416552448" timestamp=1743279288
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.505-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:14:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:14:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:14:49 launchpad ollama[18950]: INFO [main] model loaded | tid="140561416552448" timestamp=1743279289
+Mar 29 13:14:49 launchpad ollama[1510]: time=2025-03-29T13:14:49.508-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 13:15:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:03 | 200 | 15.600965044s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:14 | 200 |  3.858068645s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:20 | 200 |  1.589176928s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:30 | 200 | 14.443002461s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:16:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:16:17 | 200 |  4.031074324s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:14 | 200 |  9.958618766s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:14 | 200 |  803.635398ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:16 | 200 |    1.2515995s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:23:16 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:23:16.207-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9938206720 required="7.7 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="46.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.614-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39459"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.614-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.615-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.615-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] build info | build=0 commit="unknown" tid="140245866528768" timestamp=1743280029
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140245866528768" timestamp=1743280029 total_threads=16
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39459" tid="140245866528768" timestamp=1743280029
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.866-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:27:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:27:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:27:10 launchpad ollama[22700]: INFO [main] model loaded | tid="140245866528768" timestamp=1743280030
+Mar 29 13:27:10 launchpad ollama[1510]: time=2025-03-29T13:27:10.869-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 13:27:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:27:16 | 200 |  7.512050236s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:32:17 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:32:17.056-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.761-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9974120448 required="7.7 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.761-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.0 GiB" free_swap="68.9 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.762-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39893"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] build info | build=0 commit="unknown" tid="140205579079680" timestamp=1743280449
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140205579079680" timestamp=1743280449 total_threads=16
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39893" tid="140205579079680" timestamp=1743280449
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:34:10 launchpad ollama[1510]: time=2025-03-29T13:34:10.015-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:34:10 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:34:10 launchpad ollama[24583]: INFO [main] model loaded | tid="140205579079680" timestamp=1743280450
+Mar 29 13:34:11 launchpad ollama[1510]: time=2025-03-29T13:34:11.018-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:34:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:34:17 | 200 |  7.682613172s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:36:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:36:08 | 200 |      15.045µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:36:11 launchpad ollama[1510]: time=2025-03-29T13:36:11.365-07:00 level=INFO source=download.go:175 msg="downloading 6340dc3229b0 in 16 307 MB part(s)"
+Mar 29 13:38:11 launchpad ollama[1510]: time=2025-03-29T13:38:11.275-07:00 level=INFO source=download.go:175 msg="downloading 369ca498f347 in 1 387 B part(s)"
+Mar 29 13:38:12 launchpad ollama[1510]: time=2025-03-29T13:38:12.797-07:00 level=INFO source=download.go:175 msg="downloading 6e4c38e1172f in 1 1.1 KB part(s)"
+Mar 29 13:38:14 launchpad ollama[1510]: time=2025-03-29T13:38:14.177-07:00 level=INFO source=download.go:175 msg="downloading f4d24e9138dd in 1 148 B part(s)"
+Mar 29 13:38:15 launchpad ollama[1510]: time=2025-03-29T13:38:15.486-07:00 level=INFO source=download.go:175 msg="downloading 0cb05c6e4e02 in 1 487 B part(s)"
+Mar 29 13:38:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:38:19 | 200 |         2m10s |       127.0.0.1 | POST     "/api/pull"
+Mar 29 13:39:17 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:39:17.386-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9946005504 required="7.7 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.1 GiB" free_swap="68.9 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.036-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34835"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.036-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.037-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.037-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] build info | build=0 commit="unknown" tid="139740411109376" timestamp=1743281723
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139740411109376" timestamp=1743281723 total_threads=16
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34835" tid="139740411109376" timestamp=1743281723
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.288-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:55:23 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:55:24 launchpad ollama[30258]: INFO [main] model loaded | tid="139740411109376" timestamp=1743281724
+Mar 29 13:55:24 launchpad ollama[1510]: time=2025-03-29T13:55:24.292-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:55:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:55:31 | 200 |   8.59984653s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:57:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:57:17 | 200 |      16.399µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:57:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:57:17 | 200 |     525.548µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 13:58:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:58:07 | 200 |      15.609µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:58:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:58:07 | 200 |   12.559389ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 14:12:56 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:56 | 200 |      14.583µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:12:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:59 | 200 |      44.611µs |       127.0.0.1 | POST     "/api/blobs/sha256:6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be"
+Mar 29 14:12:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:59 | 200 |   69.446199ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:01 | 200 |      15.583µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:03 | 200 |      51.094µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 14:14:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:03 | 200 |   24.847556ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:20 | 200 |      15.174µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:23 | 200 |      43.754µs |       127.0.0.1 | POST     "/api/blobs/sha256:6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be"
+Mar 29 14:14:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:23 | 200 |   69.143041ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:48 | 200 |      15.822µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:48 | 200 |     618.311µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:15:20 | 200 |      26.491µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:16:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:16:20 | 200 |     629.155µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:16:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:16:20 | 200 |      38.872µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9934471168 required="6.5 GiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.0 GiB" free_swap="68.9 GiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39699"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] build info | build=0 commit="unknown" tid="140627769352192" timestamp=1743283089
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140627769352192" timestamp=1743283089 total_threads=16
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39699" tid="140627769352192" timestamp=1743283089
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.587-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:18:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:18:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:18:10 launchpad ollama[36827]: INFO [main] model loaded | tid="140627769352192" timestamp=1743283090
+Mar 29 14:18:10 launchpad ollama[1510]: time=2025-03-29T14:18:10.591-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:18:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:30 | 200 | 21.477310996s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:18:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:32 | 200 |  1.388959098s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:33 | 200 |  1.547988848s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:23:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:23:54 | 200 |      33.383µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9930014720 required="6.5 GiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="44.8 GiB" free_swap="68.9 GiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34413"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] build info | build=0 commit="unknown" tid="139833322168320" timestamp=1743283440
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139833322168320" timestamp=1743283440 total_threads=16
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34413" tid="139833322168320" timestamp=1743283440
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:24:01 launchpad ollama[1510]: time=2025-03-29T14:24:01.048-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:24:01 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:24:01 launchpad ollama[38425]: INFO [main] model loaded | tid="139833322168320" timestamp=1743283441
+Mar 29 14:24:02 launchpad ollama[1510]: time=2025-03-29T14:24:02.052-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:24:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:24:21 | 200 | 21.095573632s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:46:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:46:25 | 200 |      16.151µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:46:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:46:25 | 200 |      11.351µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 14:48:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:48:21 | 200 |     529.568µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:50:37 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:50:37 | 200 |     628.376µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:50:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:50:47 | 200 |      28.219µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:51:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:51:17 | 200 |      25.524µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:51:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:51:36 | 200 |       25.05µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8687517696 required="6.5 GiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="43.8 GiB" free_swap="68.9 GiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.113-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43335"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] build info | build=0 commit="unknown" tid="139661138759680" timestamp=1743285124
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139661138759680" timestamp=1743285124 total_threads=16
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43335" tid="139661138759680" timestamp=1743285124
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.365-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:52:04 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:52:05 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:52:05 launchpad ollama[46555]: INFO [main] model loaded | tid="139661138759680" timestamp=1743285125
+Mar 29 14:52:05 launchpad ollama[1510]: time=2025-03-29T14:52:05.367-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:52:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:16 | 200 | 12.229921912s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:52:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:16 | 200 |  675.074849ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:52:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:17 | 200 |  999.092403ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:57:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:57:11 | 200 |     563.021µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:03:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:03:23 | 200 |     548.489µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:03:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:03:23 | 200 |      29.257µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |      26.645µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |     541.141µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |     969.124µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:05:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:05:11 | 200 |      25.431µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:06:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:06:46 | 200 |     615.367µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:06:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:06:46 | 200 |      29.462µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:08:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:08:23 | 200 |      30.071µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:08:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:08:44 | 200 |      26.805µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8990490624 required="6.5 GiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.2 GiB" free_swap="68.9 GiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.781-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33433"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] build info | build=0 commit="unknown" tid="140565074239488" timestamp=1743286156
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140565074239488" timestamp=1743286156 total_threads=16
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33433" tid="140565074239488" timestamp=1743286156
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:09:17 launchpad ollama[1510]: time=2025-03-29T15:09:17.032-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:09:17 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:09:17 launchpad ollama[51850]: INFO [main] model loaded | tid="140565074239488" timestamp=1743286157
+Mar 29 15:09:18 launchpad ollama[1510]: time=2025-03-29T15:09:18.036-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:09:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:18 | 200 |  2.202793672s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:39 | 200 | 12.393429208s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:40 | 200 |  767.179528ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:41 | 200 |  1.098916675s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:10:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:10:38 | 200 |     575.915µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:10:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:10:39 | 200 |      26.345µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:12:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:12:27 | 200 |     578.327µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:12:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:12:31 | 200 |      27.439µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:14:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:14:02 | 200 |     522.816µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:14:56 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:14:56 | 200 |      27.209µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989310976 required="6.5 GiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="36.9 GiB" free_swap="68.9 GiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32929"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.559-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] build info | build=0 commit="unknown" tid="139621333716992" timestamp=1743286509
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139621333716992" timestamp=1743286509 total_threads=16
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32929" tid="139621333716992" timestamp=1743286509
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.809-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:15:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:15:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:15:10 launchpad ollama[53962]: INFO [main] model loaded | tid="139621333716992" timestamp=1743286510
+Mar 29 15:15:10 launchpad ollama[1510]: time=2025-03-29T15:15:10.812-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:18 | 200 |  8.684809515s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:18 | 200 |  630.645845ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:20 | 200 |  1.754691555s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:16:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:16:40 | 200 |  9.252340304s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:17:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:17:34 | 200 |  8.512406778s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:21 | 200 |     550.079µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:18:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:24 | 200 |      25.383µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:33 | 200 |  4.758009566s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:33 | 200 |  380.581024ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:34 | 200 |  717.507959ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:19:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:19:31 | 200 |  7.114769033s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:20:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:20:09 | 200 |  1.248733669s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:20:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:20:22 | 200 | 10.979411906s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:21:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:21:10 | 200 |     614.951µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:32:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:32:47 | 200 |     527.521µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:33:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:10 | 200 |     536.619µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:33:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:13 | 200 |      27.247µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9001500672 required="6.5 GiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.6 GiB" free_swap="68.9 GiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38149"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] build info | build=0 commit="unknown" tid="139883615182848" timestamp=1743287613
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139883615182848" timestamp=1743287613 total_threads=16
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38149" tid="139883615182848" timestamp=1743287613
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.725-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:33:33 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:33:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:33:34 launchpad ollama[58902]: INFO [main] model loaded | tid="139883615182848" timestamp=1743287614
+Mar 29 15:33:34 launchpad ollama[1510]: time=2025-03-29T15:33:34.729-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 15:33:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:43 | 200 |  9.933743141s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:33:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:43 | 200 |  672.981374ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:33:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:44 | 200 |  883.962566ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:34:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:34:41 | 200 |  7.224379028s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:36:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:36:42 | 200 |  1.400049248s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:36:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:36:55 | 200 |  6.371107151s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:15 | 200 |   1.62644083s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:18 | 200 |  2.701827007s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:34 | 200 |  1.607649512s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:38 | 200 |  4.575297353s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:54 | 200 |  1.391617428s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:59 | 200 |  5.173918447s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:38:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:38:43 | 200 |  1.299442014s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:38:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:38:57 | 200 |  6.476457729s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:40:32 | 200 |  1.263283008s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:40:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:40:47 | 200 | 14.443386837s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:08 | 200 |      25.749µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:42:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:25 | 200 | 13.333347259s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:26 | 200 |  870.899897ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:27 | 200 |  1.233841321s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:45:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:45:40 | 200 |      26.033µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:46:00 launchpad ollama[1510]: time=2025-03-29T15:46:00.821-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9010544640 required="7.7 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.0 GiB" free_swap="68.9 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37267"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.524-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] build info | build=0 commit="unknown" tid="139889629814784" timestamp=1743288361
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139889629814784" timestamp=1743288361 total_threads=16
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37267" tid="139889629814784" timestamp=1743288361
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.775-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:46:01 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:46:02 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:46:02 launchpad ollama[62138]: INFO [main] model loaded | tid="139889629814784" timestamp=1743288362
+Mar 29 15:46:02 launchpad ollama[1510]: time=2025-03-29T15:46:02.778-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:46:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:04 | 200 |  3.424421819s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:13 | 200 |  4.879807842s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:49 | 200 |  1.052225209s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:54 | 200 |  5.132097154s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:50:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:28 | 200 |        26.3µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:50:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:38 | 200 |    897.9825ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:50:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:51 | 200 |  6.948801191s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:52:21 launchpad ollama[1510]: time=2025-03-29T15:52:21.545-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="1.1 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9037217792 required="6.5 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.4 GiB" free_swap="68.9 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37103"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.275-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] build info | build=0 commit="unknown" tid="139973150744576" timestamp=1743288742
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139973150744576" timestamp=1743288742 total_threads=16
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37103" tid="139973150744576" timestamp=1743288742
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.526-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:52:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:52:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:52:23 launchpad ollama[63870]: INFO [main] model loaded | tid="139973150744576" timestamp=1743288743
+Mar 29 15:52:23 launchpad ollama[1510]: time=2025-03-29T15:52:23.528-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:52:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:52:35 | 200 | 13.962988563s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:53:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:53:21 | 200 | 10.415352509s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:54:17 | 200 |  6.296363593s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:54:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:54:44 | 200 |  4.399340247s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:03 | 200 |  6.300933513s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:31 | 200 |     582.645µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:55:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:36 | 200 |      26.017µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:55:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:54 | 200 |  5.422678956s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:59 | 200 |      25.119µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:56:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:09 | 200 |  5.536962529s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:10 | 200 |  834.192484ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:11 | 200 |  683.826882ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:57 | 200 |     533.139µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:57:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:00 | 200 |      26.765µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:57:04 launchpad ollama[1510]: time=2025-03-29T15:57:04.394-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8998748160 required="6.2 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.5 GiB" free_swap="68.9 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.108-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39715"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] build info | build=0 commit="unknown" tid="140646590676992" timestamp=1743289025
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140646590676992" timestamp=1743289025 total_threads=16
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39715" tid="140646590676992" timestamp=1743289025
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.359-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:57:05 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:57:06 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:57:06 launchpad ollama[65101]: INFO [main] model loaded | tid="140646590676992" timestamp=1743289026
+Mar 29 15:57:06 launchpad ollama[1510]: time=2025-03-29T15:57:06.363-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:57:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:09 | 200 |  5.238370692s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:57:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:09 | 200 |  403.352434ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:57:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:11 | 200 |   1.72354465s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:58:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:58:01 | 200 |  2.934241667s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:58:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:58:13 | 200 |     559.124µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:00:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:00:52 | 200 |      26.432µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:01:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:01:22 | 200 |      36.682µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:02 launchpad ollama[1510]: time=2025-03-29T16:02:02.865-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 16:02:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:04 | 200 |  2.127141193s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:19 | 200 |      25.814µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:28 | 200 |      25.063µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:47 | 200 |  1.347830878s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:47 | 200 |  369.390909ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:49 | 200 |  1.428751454s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:03:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:03:08 | 200 |  2.267334677s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:03:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:03:38 | 200 |  1.247778535s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:04:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:04:47 | 200 |  3.313803625s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:05:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:05:31 | 200 |  4.136510182s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:06:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:06:09 | 200 |  4.212121131s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:07:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:07:54 | 200 |  5.164935817s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:23:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:23:07 | 200 |     628.736µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:38:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:38:28 | 200 |     531.066µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.368-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8988655616 required="6.2 GiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.368-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="35.4 GiB" free_swap="68.9 GiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.369-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.370-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38905"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] build info | build=0 commit="unknown" tid="140000488312832" timestamp=1743291743
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140000488312832" timestamp=1743291743 total_threads=16
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38905" tid="140000488312832" timestamp=1743291743
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.623-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 16:42:23 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 16:42:24 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 16:42:24 launchpad ollama[91493]: INFO [main] model loaded | tid="140000488312832" timestamp=1743291744
+Mar 29 16:42:24 launchpad ollama[1510]: time=2025-03-29T16:42:24.627-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 16:42:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:42:30 | 200 |  7.653093301s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8867414016 required="6.2 GiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="36.4 GiB" free_swap="68.9 GiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38435"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] build info | build=0 commit="unknown" tid="139900342849536" timestamp=1743292293
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139900342849536" timestamp=1743292293 total_threads=16
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38435" tid="139900342849536" timestamp=1743292293
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.486-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 16:51:33 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 16:51:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 16:51:34 launchpad ollama[94118]: INFO [main] model loaded | tid="139900342849536" timestamp=1743292294
+Mar 29 16:51:34 launchpad ollama[1510]: time=2025-03-29T16:51:34.490-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 16:51:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:51:39 | 200 |  6.709006721s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:54:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:54:09 | 200 |     514.283µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:56:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:56:24 | 200 |     537.924µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:00:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:00:54 | 200 |     581.225µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:06:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:06:45 | 200 |     528.457µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:12:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:12:03 | 200 |     511.642µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:14:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:14:33 | 200 |      36.602µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:16:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:16:29 | 200 |      26.409µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:18:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:06 | 200 |      26.305µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9077325824 required="7.7 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="38.1 GiB" free_swap="68.9 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41413"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] build info | build=0 commit="unknown" tid="139720255180800" timestamp=1743293905
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139720255180800" timestamp=1743293905 total_threads=16
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41413" tid="139720255180800" timestamp=1743293905
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.530-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:18:25 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:18:26 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:18:26 launchpad ollama[106886]: INFO [main] model loaded | tid="139720255180800" timestamp=1743293906
+Mar 29 17:18:26 launchpad ollama[1510]: time=2025-03-29T17:18:26.534-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:18:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:34 | 200 |  9.681979087s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:18:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:35 | 200 |  677.945416ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:18:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:38 | 200 |  2.856671931s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:37:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:37:32 | 200 |      28.974µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8734965760 required="6.2 GiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.716-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45447"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] build info | build=0 commit="unknown" tid="140641303601152" timestamp=1743295070
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140641303601152" timestamp=1743295070 total_threads=16
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45447" tid="140641303601152" timestamp=1743295070
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.968-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:37:50 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:37:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:37:51 launchpad ollama[113882]: INFO [main] model loaded | tid="140641303601152" timestamp=1743295071
+Mar 29 17:37:51 launchpad ollama[1510]: time=2025-03-29T17:37:51.971-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:37:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:37:57 | 200 |  6.687304313s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:43:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:43:55 | 200 |     698.996µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:58:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:58:46 | 200 |     598.295µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:59:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:28 | 200 |      32.202µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.440-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8898215936 required="6.2 GiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.440-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.4 GiB" free_swap="68.9 GiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.441-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45883"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] build info | build=0 commit="unknown" tid="140180380872704" timestamp=1743296372
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140180380872704" timestamp=1743296372 total_threads=16
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45883" tid="140180380872704" timestamp=1743296372
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.693-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:59:32 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:59:33 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:59:33 launchpad ollama[122308]: INFO [main] model loaded | tid="140180380872704" timestamp=1743296373
+Mar 29 17:59:33 launchpad ollama[1510]: time=2025-03-29T17:59:33.696-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:59:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:36 | 200 |  3.966688616s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:59:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:36 | 200 |  377.108442ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:59:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:39 | 200 |  2.943474695s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:03:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:03:08 | 200 |  5.278321885s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:06:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:06:12 | 200 |  5.060123665s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:07:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:28 | 200 |     964.908µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:07:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:33 | 200 |      27.917µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:07:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:49 | 200 |      27.009µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:09:47 launchpad ollama[1510]: time=2025-03-29T18:09:47.378-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8905359360 required="6.5 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.4 GiB" free_swap="68.9 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46311"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] build info | build=0 commit="unknown" tid="140020102225920" timestamp=1743296988
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140020102225920" timestamp=1743296988 total_threads=16
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46311" tid="140020102225920" timestamp=1743296988
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.333-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:09:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:09:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:09:49 launchpad ollama[125078]: INFO [main] model loaded | tid="140020102225920" timestamp=1743296989
+Mar 29 18:09:49 launchpad ollama[1510]: time=2025-03-29T18:09:49.337-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 18:10:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:02 | 200 | 15.523117655s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:10:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:03 | 200 |  1.017832846s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:10:05 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:05 | 200 |  1.236342831s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:13:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:13:28 | 200 |      28.056µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:13:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:13:49 | 200 |      25.423µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:16:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:16:48 | 200 |      28.128µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:21:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:21:11 | 200 |      29.659µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932360192 required="6.2 GiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.2 GiB" free_swap="68.9 GiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.852-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38689"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] build info | build=0 commit="unknown" tid="140086025785344" timestamp=1743297753
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140086025785344" timestamp=1743297753 total_threads=16
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38689" tid="140086025785344" timestamp=1743297753
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 18:22:34 launchpad ollama[1510]: time=2025-03-29T18:22:34.104-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:22:34 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:22:34 launchpad ollama[129081]: INFO [main] model loaded | tid="140086025785344" timestamp=1743297754
+Mar 29 18:22:35 launchpad ollama[1510]: time=2025-03-29T18:22:35.108-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 18:22:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:35 | 200 |  1.629465188s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 18:22:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:35 | 200 |  188.390586ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 18:22:35 launchpad ollama[1510]: time=2025-03-29T18:22:35.689-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932491264 required="6.5 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.1 GiB" free_swap="68.9 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44743"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] build info | build=0 commit="unknown" tid="140247785431040" timestamp=1743297756
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140247785431040" timestamp=1743297756 total_threads=16
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44743" tid="140247785431040" timestamp=1743297756
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.625-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:22:36 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:22:37 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:22:37 launchpad ollama[129113]: INFO [main] model loaded | tid="140247785431040" timestamp=1743297757
+Mar 29 18:22:37 launchpad ollama[1510]: time=2025-03-29T18:22:37.628-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 18:22:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:50 | 200 | 14.653973203s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:22:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:51 | 200 |  1.664478257s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:22:53 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:53 | 200 |  1.216371711s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:23:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:23:59 | 200 |     612.232µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:25:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:25:48 | 200 |     518.057µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:38:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:38:59 | 200 |     1.95315ms |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:03:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:03:01 | 200 |     710.615µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:14:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:14:13 | 200 |     571.773µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:14:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:14:41 | 200 |      32.573µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42605"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] build info | build=0 commit="unknown" tid="139766912868352" timestamp=1743300939
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139766912868352" timestamp=1743300939 total_threads=16
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42605" tid="139766912868352" timestamp=1743300939
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.286-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:15:39 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:15:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:15:40 launchpad ollama[144822]: INFO [main] model loaded | tid="139766912868352" timestamp=1743300940
+Mar 29 19:15:40 launchpad ollama[1510]: time=2025-03-29T19:15:40.290-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:15:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:15:41 | 200 |  3.014868197s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:06 launchpad ollama[1510]: time=2025-03-29T19:16:06.400-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947040256 required="6.5 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33891"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] build info | build=0 commit="unknown" tid="140273985212416" timestamp=1743300967
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140273985212416" timestamp=1743300967 total_threads=16
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33891" tid="140273985212416" timestamp=1743300967
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.376-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:07 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:09 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:10 launchpad ollama[144939]: INFO [main] model loaded | tid="140273985212416" timestamp=1743300970
+Mar 29 19:16:10 launchpad ollama[1510]: time=2025-03-29T19:16:10.133-07:00 level=INFO source=server.go:626 msg="llama runner started in 3.01 seconds"
+Mar 29 19:16:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:17 | 200 |  11.65726168s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:16:18 launchpad ollama[1510]: time=2025-03-29T19:16:18.965-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.683-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34981"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] build info | build=0 commit="unknown" tid="140137406337024" timestamp=1743300979
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140137406337024" timestamp=1743300979 total_threads=16
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34981" tid="140137406337024" timestamp=1743300979
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.935-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:20 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:20 launchpad ollama[145026]: INFO [main] model loaded | tid="140137406337024" timestamp=1743300980
+Mar 29 19:16:20 launchpad ollama[1510]: time=2025-03-29T19:16:20.938-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:16:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:21 | 200 |  2.304047755s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:25 | 200 |   164.46829ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |  1.042665096s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |   82.974107ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |  164.100542ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: time=2025-03-29T19:16:26.768-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.474-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.5 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.474-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.475-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.475-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42745"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] build info | build=0 commit="unknown" tid="139800153673728" timestamp=1743300987
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139800153673728" timestamp=1743300987 total_threads=16
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42745" tid="139800153673728" timestamp=1743300987
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.727-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:28 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:28 launchpad ollama[145107]: INFO [main] model loaded | tid="139800153673728" timestamp=1743300988
+Mar 29 19:16:28 launchpad ollama[1510]: time=2025-03-29T19:16:28.731-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:16:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:36 | 200 |  9.842079814s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:18:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:09 | 200 |  1.638019863s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:18:10 launchpad ollama[1510]: time=2025-03-29T19:18:10.324-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.3 GiB" free_swap="68.9 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.028-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39675"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] build info | build=0 commit="unknown" tid="140691987349504" timestamp=1743301091
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140691987349504" timestamp=1743301091 total_threads=16
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39675" tid="140691987349504" timestamp=1743301091
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:18:11 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:18:12 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:18:12 launchpad ollama[145577]: INFO [main] model loaded | tid="140691987349504" timestamp=1743301092
+Mar 29 19:18:12 launchpad ollama[1510]: time=2025-03-29T19:18:12.284-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:18:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:12 | 200 |  2.342627575s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:12 | 200 |  165.867903ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  607.373291ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  166.575372ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  167.455848ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: time=2025-03-29T19:18:13.801-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.5 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41827"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.526-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] build info | build=0 commit="unknown" tid="139784904335360" timestamp=1743301094
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139784904335360" timestamp=1743301094 total_threads=16
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41827" tid="139784904335360" timestamp=1743301094
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.777-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:18:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:18:15 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:18:15 launchpad ollama[145623]: INFO [main] model loaded | tid="139784904335360" timestamp=1743301095
+Mar 29 19:18:15 launchpad ollama[1510]: time=2025-03-29T19:18:15.781-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:18:16 launchpad ollama[145623]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1023 n_keep=4 n_left=2044 n_shift=1022 tid="139784904335360" timestamp=1743301096
+Mar 29 19:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:33 | 200 | 19.739859237s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:21:05 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:05 | 200 |  7.414607705s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:21:15 launchpad ollama[1510]: time=2025-03-29T19:21:15.361-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.2 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43935"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] build info | build=0 commit="unknown" tid="140224637255680" timestamp=1743301276
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140224637255680" timestamp=1743301276 total_threads=16
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43935" tid="140224637255680" timestamp=1743301276
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.349-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:21:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:21:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:21:17 launchpad ollama[146421]: INFO [main] model loaded | tid="140224637255680" timestamp=1743301277
+Mar 29 19:21:17 launchpad ollama[1510]: time=2025-03-29T19:21:17.352-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:21:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:17 | 200 |  2.345376212s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:17 | 200 |  178.181848ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |  953.155192ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |   95.123191ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |  176.692628ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.184-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.5 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44869"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] build info | build=0 commit="unknown" tid="140033613987840" timestamp=1743301279
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140033613987840" timestamp=1743301279 total_threads=16
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44869" tid="140033613987840" timestamp=1743301279
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:21:20 launchpad ollama[1510]: time=2025-03-29T19:21:20.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:21:20 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:21:20 launchpad ollama[146455]: INFO [main] model loaded | tid="140033613987840" timestamp=1743301280
+Mar 29 19:21:21 launchpad ollama[1510]: time=2025-03-29T19:21:21.141-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:21:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:39 | 200 | 20.608513117s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:24:32 launchpad ollama[146455]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3009 n_keep=4 n_left=2044 n_shift=1022 tid="140033613987840" timestamp=1743301472
+Mar 29 19:24:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:24:57 | 200 | 25.210920696s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:24:57 launchpad ollama[1510]: time=2025-03-29T19:24:57.719-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.2 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33181"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] build info | build=0 commit="unknown" tid="140250669076480" timestamp=1743301498
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140250669076480" timestamp=1743301498 total_threads=16
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33181" tid="140250669076480" timestamp=1743301498
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.738-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:24:58 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:24:59 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:24:59 launchpad ollama[147404]: INFO [main] model loaded | tid="140250669076480" timestamp=1743301499
+Mar 29 19:24:59 launchpad ollama[1510]: time=2025-03-29T19:24:59.742-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:25:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:00 | 200 |  3.173991021s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:01 | 200 |  982.464629ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:02 | 200 |  870.407053ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:03 | 200 |  941.090479ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:04 | 200 |  981.459742ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:04 launchpad ollama[1510]: time=2025-03-29T19:25:04.762-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.468-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.5 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.469-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.469-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44611"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] build info | build=0 commit="unknown" tid="140124223516672" timestamp=1743301505
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140124223516672" timestamp=1743301505 total_threads=16
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44611" tid="140124223516672" timestamp=1743301505
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.721-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:25:05 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:25:06 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:25:06 launchpad ollama[147480]: INFO [main] model loaded | tid="140124223516672" timestamp=1743301506
+Mar 29 19:25:06 launchpad ollama[1510]: time=2025-03-29T19:25:06.724-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:25:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:25 | 200 | 20.884409081s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:26:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:26:13 | 200 |      24.915µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.180-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39859"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] build info | build=0 commit="unknown" tid="140328937287680" timestamp=1743301664
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140328937287680" timestamp=1743301664 total_threads=16
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39859" tid="140328937287680" timestamp=1743301664
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:27:45 launchpad ollama[1510]: time=2025-03-29T19:27:45.152-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:27:45 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:27:45 launchpad ollama[148205]: INFO [main] model loaded | tid="140328937287680" timestamp=1743301665
+Mar 29 19:27:46 launchpad ollama[1510]: time=2025-03-29T19:27:46.156-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:27:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:46 | 200 |  2.352563192s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:46 | 200 |  133.497156ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |  848.001811ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |   90.798322ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |  174.689794ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: time=2025-03-29T19:27:47.829-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38387"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] build info | build=0 commit="unknown" tid="140639076442112" timestamp=1743301668
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140639076442112" timestamp=1743301668 total_threads=16
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38387" tid="140639076442112" timestamp=1743301668
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.800-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:27:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:27:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:27:49 launchpad ollama[148239]: INFO [main] model loaded | tid="140639076442112" timestamp=1743301669
+Mar 29 19:27:49 launchpad ollama[1510]: time=2025-03-29T19:27:49.802-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:28:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:28:24 | 200 | 36.600397079s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:29:47 launchpad ollama[148239]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1602 n_keep=4 n_left=2044 n_shift=1022 tid="140639076442112" timestamp=1743301787
+Mar 29 19:30:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:14 | 200 | 26.665771335s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:30:14 launchpad ollama[1510]: time=2025-03-29T19:30:14.398-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.111-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.111-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.112-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44021"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] build info | build=0 commit="unknown" tid="140053578866688" timestamp=1743301815
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140053578866688" timestamp=1743301815 total_threads=16
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44021" tid="140053578866688" timestamp=1743301815
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.364-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:30:15 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:30:16 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:30:16 launchpad ollama[148901]: INFO [main] model loaded | tid="140053578866688" timestamp=1743301816
+Mar 29 19:30:16 launchpad ollama[1510]: time=2025-03-29T19:30:16.368-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:30:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:16 | 200 |  2.303012071s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:16 | 200 |   190.64113ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  933.720343ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  107.107543ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  188.381827ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.189-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957394944 required="6.5 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35763"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.897-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] build info | build=0 commit="unknown" tid="140544602787840" timestamp=1743301818
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140544602787840" timestamp=1743301818 total_threads=16
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35763" tid="140544602787840" timestamp=1743301818
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:30:19 launchpad ollama[1510]: time=2025-03-29T19:30:19.148-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:30:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:30:20 launchpad ollama[148937]: INFO [main] model loaded | tid="140544602787840" timestamp=1743301820
+Mar 29 19:30:20 launchpad ollama[1510]: time=2025-03-29T19:30:20.152-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:30:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:34 | 200 | 16.909366725s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:30:44 launchpad ollama[148937]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1384 n_keep=4 n_left=2044 n_shift=1022 tid="140544602787840" timestamp=1743301844
+Mar 29 19:31:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:08 | 200 |  25.01725736s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.073-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957132800 required="6.2 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36695"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.780-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] build info | build=0 commit="unknown" tid="140110638301184" timestamp=1743301869
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140110638301184" timestamp=1743301869 total_threads=16
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36695" tid="140110638301184" timestamp=1743301869
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:31:10 launchpad ollama[1510]: time=2025-03-29T19:31:10.031-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:31:10 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:31:10 launchpad ollama[149206]: INFO [main] model loaded | tid="140110638301184" timestamp=1743301870
+Mar 29 19:31:10 launchpad ollama[1510]: time=2025-03-29T19:31:10.785-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 29 19:31:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:10 | 200 |  2.029661009s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:11 | 200 |  184.019162ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |  948.727948ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |    102.5928ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |  185.694818ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: time=2025-03-29T19:31:12.597-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957132800 required="6.5 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36083"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.304-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] build info | build=0 commit="unknown" tid="140352045539328" timestamp=1743301873
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140352045539328" timestamp=1743301873 total_threads=16
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36083" tid="140352045539328" timestamp=1743301873
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.554-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:31:13 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:31:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:31:14 launchpad ollama[149248]: INFO [main] model loaded | tid="140352045539328" timestamp=1743301874
+Mar 29 19:31:14 launchpad ollama[1510]: time=2025-03-29T19:31:14.558-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:31:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:21 | 200 |  9.064561113s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:31:55 launchpad ollama[149248]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1649 n_keep=4 n_left=2044 n_shift=1022 tid="140352045539328" timestamp=1743301915
+Mar 29 19:32:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:08 | 200 | 12.886429699s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:32:08 launchpad ollama[1510]: time=2025-03-29T19:32:08.790-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40947"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] build info | build=0 commit="unknown" tid="140410352848896" timestamp=1743301929
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140410352848896" timestamp=1743301929 total_threads=16
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40947" tid="140410352848896" timestamp=1743301929
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.776-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:10 launchpad ollama[149506]: INFO [main] model loaded | tid="140410352848896" timestamp=1743301930
+Mar 29 19:32:10 launchpad ollama[1510]: time=2025-03-29T19:32:10.780-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:32:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:10 | 200 |  2.331624054s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:11 | 200 |  168.753064ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |   1.03890153s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |  127.578393ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |  166.203922ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: time=2025-03-29T19:32:12.705-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.405-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43015"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] build info | build=0 commit="unknown" tid="140671766847488" timestamp=1743301933
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140671766847488" timestamp=1743301933 total_threads=16
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43015" tid="140671766847488" timestamp=1743301933
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.657-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:13 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:14 launchpad ollama[149577]: INFO [main] model loaded | tid="140671766847488" timestamp=1743301934
+Mar 29 19:32:14 launchpad ollama[1510]: time=2025-03-29T19:32:14.661-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:32:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:43 | 200 | 30.733695259s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:32:57 launchpad ollama[1510]: time=2025-03-29T19:32:57.617-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.337-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46135"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.337-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.338-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.338-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] build info | build=0 commit="unknown" tid="140039007854592" timestamp=1743301978
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039007854592" timestamp=1743301978 total_threads=16
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46135" tid="140039007854592" timestamp=1743301978
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.589-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:58 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:59 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:59 launchpad ollama[149801]: INFO [main] model loaded | tid="140039007854592" timestamp=1743301979
+Mar 29 19:32:59 launchpad ollama[1510]: time=2025-03-29T19:32:59.593-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:33:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:00 | 200 |   3.41028149s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:15 launchpad ollama[1510]: time=2025-03-29T19:33:15.939-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33563"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] build info | build=0 commit="unknown" tid="140138493497344" timestamp=1743301996
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140138493497344" timestamp=1743301996 total_threads=16
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33563" tid="140138493497344" timestamp=1743301996
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.899-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:17 launchpad ollama[149900]: INFO [main] model loaded | tid="140138493497344" timestamp=1743301997
+Mar 29 19:33:17 launchpad ollama[1510]: time=2025-03-29T19:33:17.902-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:33:17 launchpad ollama[149900]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1287 n_keep=4 n_left=2044 n_shift=1022 tid="140138493497344" timestamp=1743301997
+Mar 29 19:33:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:29 | 200 | 13.579537466s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:33:29 launchpad ollama[1510]: time=2025-03-29T19:33:29.505-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.207-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39367"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.207-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.208-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.208-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] build info | build=0 commit="unknown" tid="140336398630912" timestamp=1743302010
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140336398630912" timestamp=1743302010 total_threads=16
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39367" tid="140336398630912" timestamp=1743302010
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.459-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:31 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:31 launchpad ollama[149990]: INFO [main] model loaded | tid="140336398630912" timestamp=1743302011
+Mar 29 19:33:31 launchpad ollama[1510]: time=2025-03-29T19:33:31.462-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:33:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:31 | 200 |  2.283415193s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:31 | 200 |  155.392816ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |  853.282051ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |   74.143485ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |  113.733131ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:33 | 200 |  1.134762366s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:34 | 200 |   73.051234ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:34 | 200 |  157.075762ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: time=2025-03-29T19:33:34.458-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33705"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] build info | build=0 commit="unknown" tid="140655059640320" timestamp=1743302015
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140655059640320" timestamp=1743302015 total_threads=16
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33705" tid="140655059640320" timestamp=1743302015
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.435-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:35 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:36 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:36 launchpad ollama[150028]: INFO [main] model loaded | tid="140655059640320" timestamp=1743302016
+Mar 29 19:33:36 launchpad ollama[1510]: time=2025-03-29T19:33:36.438-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:34:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:34:06 | 200 | 32.315525661s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:35:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:35:30 | 200 |     575.985µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:35:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:35:34 | 200 |       24.44µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:36:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:02 | 200 |   5.89327997s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:13 launchpad ollama[1510]: time=2025-03-29T19:36:13.416-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932032512 required="6.2 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.135-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32949"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] build info | build=0 commit="unknown" tid="139902623862784" timestamp=1743302174
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139902623862784" timestamp=1743302174 total_threads=16
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32949" tid="139902623862784" timestamp=1743302174
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.386-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:36:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:36:15 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:36:15 launchpad ollama[150737]: INFO [main] model loaded | tid="139902623862784" timestamp=1743302175
+Mar 29 19:36:15 launchpad ollama[1510]: time=2025-03-29T19:36:15.390-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:36:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:15 | 200 |   2.31917171s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:36:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:15 | 200 |    129.7234ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:36:15 launchpad ollama[1510]: time=2025-03-29T19:36:15.896-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.588-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932032512 required="6.5 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.588-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.589-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.589-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33651"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] build info | build=0 commit="unknown" tid="140212018692096" timestamp=1743302176
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140212018692096" timestamp=1743302176 total_threads=16
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33651" tid="140212018692096" timestamp=1743302176
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.840-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:36:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:36:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:36:17 launchpad ollama[150780]: INFO [main] model loaded | tid="140212018692096" timestamp=1743302177
+Mar 29 19:36:17 launchpad ollama[1510]: time=2025-03-29T19:36:17.845-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:36:37 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:37 | 200 | 21.335777997s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:38 | 200 |  1.279943052s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:40 | 200 |  1.732500632s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:38:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:20 | 200 | 12.653378378s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:38:20 launchpad ollama[1510]: time=2025-03-29T19:38:20.960-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8945991680 required="6.2 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42663"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] build info | build=0 commit="unknown" tid="139869312413696" timestamp=1743302301
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139869312413696" timestamp=1743302301 total_threads=16
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42663" tid="139869312413696" timestamp=1743302301
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.914-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:38:21 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:38:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:38:22 launchpad ollama[151356]: INFO [main] model loaded | tid="139869312413696" timestamp=1743302302
+Mar 29 19:38:22 launchpad ollama[1510]: time=2025-03-29T19:38:22.918-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:38:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:23 | 200 |  2.305049387s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:38:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:23 | 200 |  176.483754ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:38:23 launchpad ollama[1510]: time=2025-03-29T19:38:23.502-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.204-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8945991680 required="6.5 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.204-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.205-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40331"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] build info | build=0 commit="unknown" tid="140402633170944" timestamp=1743302304
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140402633170944" timestamp=1743302304 total_threads=16
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40331" tid="140402633170944" timestamp=1743302304
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.458-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:38:24 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:38:25 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:38:25 launchpad ollama[151388]: INFO [main] model loaded | tid="140402633170944" timestamp=1743302305
+Mar 29 19:38:25 launchpad ollama[1510]: time=2025-03-29T19:38:25.462-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:38:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:40 | 200 | 17.534730037s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:40:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:25 | 200 | 13.699120848s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:40:25 launchpad ollama[1510]: time=2025-03-29T19:40:25.395-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8943894528 required="6.2 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.119-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41631"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] build info | build=0 commit="unknown" tid="139980962881536" timestamp=1743302426
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139980962881536" timestamp=1743302426 total_threads=16
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41631" tid="139980962881536" timestamp=1743302426
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.371-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:40:26 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:40:27 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:40:27 launchpad ollama[152028]: INFO [main] model loaded | tid="139980962881536" timestamp=1743302427
+Mar 29 19:40:27 launchpad ollama[1510]: time=2025-03-29T19:40:27.374-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:40:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:27 | 200 |  2.321062718s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:40:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:27 | 200 |  171.984812ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:40:27 launchpad ollama[1510]: time=2025-03-29T19:40:27.946-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8943894528 required="6.5 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.643-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34229"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] build info | build=0 commit="unknown" tid="140371490562048" timestamp=1743302428
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140371490562048" timestamp=1743302428 total_threads=16
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34229" tid="140371490562048" timestamp=1743302428
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.895-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:40:28 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:40:29 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:40:29 launchpad ollama[152060]: INFO [main] model loaded | tid="140371490562048" timestamp=1743302429
+Mar 29 19:40:29 launchpad ollama[1510]: time=2025-03-29T19:40:29.898-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:40:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:49 | 200 | 22.011537931s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:42:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:18 | 200 | 21.984149732s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.225-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.927-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8960671744 required="6.2 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.927-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.928-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.928-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39601"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] build info | build=0 commit="unknown" tid="140265974153216" timestamp=1743302538
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140265974153216" timestamp=1743302538 total_threads=16
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39601" tid="140265974153216" timestamp=1743302538
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: time=2025-03-29T19:42:19.180-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:42:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:42:19 launchpad ollama[152547]: INFO [main] model loaded | tid="140265974153216" timestamp=1743302539
+Mar 29 19:42:20 launchpad ollama[1510]: time=2025-03-29T19:42:20.184-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:42:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:20 | 200 |  2.303696929s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:42:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:20 | 200 |  171.320282ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:42:20 launchpad ollama[1510]: time=2025-03-29T19:42:20.734-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.432-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8960671744 required="6.5 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.432-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.433-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34841"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] build info | build=0 commit="unknown" tid="140171713527808" timestamp=1743302541
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140171713527808" timestamp=1743302541 total_threads=16
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34841" tid="140171713527808" timestamp=1743302541
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.685-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:42:21 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:42:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:42:22 launchpad ollama[152614]: INFO [main] model loaded | tid="140171713527808" timestamp=1743302542
+Mar 29 19:42:22 launchpad ollama[1510]: time=2025-03-29T19:42:22.688-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:42:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:40 | 200 | 19.623128288s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:43:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:44 | 200 |  6.429165086s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:43:44 launchpad ollama[1510]: time=2025-03-29T19:43:44.756-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8941469696 required="6.2 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45405"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] build info | build=0 commit="unknown" tid="139653163413504" timestamp=1743302625
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139653163413504" timestamp=1743302625 total_threads=16
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45405" tid="139653163413504" timestamp=1743302625
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.719-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:43:45 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:43:46 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:43:46 launchpad ollama[152991]: INFO [main] model loaded | tid="139653163413504" timestamp=1743302626
+Mar 29 19:43:46 launchpad ollama[1510]: time=2025-03-29T19:43:46.723-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:43:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:46 | 200 |  2.333994117s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:43:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:47 | 200 |  167.006683ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.269-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.994-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8941469696 required="6.5 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.995-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.995-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34321"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] build info | build=0 commit="unknown" tid="140311399333888" timestamp=1743302628
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140311399333888" timestamp=1743302628 total_threads=16
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34321" tid="140311399333888" timestamp=1743302628
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: time=2025-03-29T19:43:48.247-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:43:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:43:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:43:49 launchpad ollama[153025]: INFO [main] model loaded | tid="140311399333888" timestamp=1743302629
+Mar 29 19:43:49 launchpad ollama[1510]: time=2025-03-29T19:43:49.252-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:44:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:44:04 | 200 | 17.402217395s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:47:10 launchpad ollama[153025]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1475 n_keep=4 n_left=2044 n_shift=1022 tid="140311399333888" timestamp=1743302830
+Mar 29 19:47:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:47:16 | 200 |  6.205132773s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:47:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:47:35 | 200 |      36.636µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.143-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8966701056 required="6.2 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38745"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] build info | build=0 commit="unknown" tid="139790732369920" timestamp=1743302890
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790732369920" timestamp=1743302890 total_threads=16
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38745" tid="139790732369920" timestamp=1743302890
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:48:11 launchpad ollama[1510]: time=2025-03-29T19:48:11.095-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:48:11 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:48:11 launchpad ollama[154151]: INFO [main] model loaded | tid="139790732369920" timestamp=1743302891
+Mar 29 19:48:12 launchpad ollama[1510]: time=2025-03-29T19:48:12.098-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:48:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:48:16 | 200 |  6.364819376s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:50:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:50:00 | 200 |  3.466462233s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:50:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:50:29 | 200 |  6.391877077s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:51:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:49 | 200 |   176.22525ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |  160.778236ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |  786.856972ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |   78.256203ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:51 | 200 |  161.909934ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:51 | 200 |  797.424571ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:52 | 200 |   78.491835ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:52 | 200 |  159.008679ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: time=2025-03-29T19:51:52.242-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:51:52 launchpad ollama[1510]: time=2025-03-29T19:51:52.410-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.135-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.135-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.136-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35249"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] build info | build=0 commit="unknown" tid="140389238358016" timestamp=1743303113
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140389238358016" timestamp=1743303113 total_threads=16
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35249" tid="140389238358016" timestamp=1743303113
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.388-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:51:53 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:51:54 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:51:54 launchpad ollama[155156]: INFO [main] model loaded | tid="140389238358016" timestamp=1743303114
+Mar 29 19:51:54 launchpad ollama[1510]: time=2025-03-29T19:51:54.391-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:52:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:52:28 | 200 | 35.790191346s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:54:14 launchpad ollama[1510]: time=2025-03-29T19:54:14.889-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.2 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44435"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] build info | build=0 commit="unknown" tid="140376766713856" timestamp=1743303255
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140376766713856" timestamp=1743303255 total_threads=16
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44435" tid="140376766713856" timestamp=1743303255
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.846-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:54:15 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:54:16 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:54:16 launchpad ollama[155780]: INFO [main] model loaded | tid="140376766713856" timestamp=1743303256
+Mar 29 19:54:16 launchpad ollama[1510]: time=2025-03-29T19:54:16.850-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:17 | 200 |  2.332103187s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:17 | 200 |  180.036992ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  865.748366ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  136.299068ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  176.635973ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |  766.638269ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |    95.16046ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |  177.320643ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: time=2025-03-29T19:54:19.534-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:54:19 launchpad ollama[1510]: time=2025-03-29T19:54:19.703-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.421-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.421-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.422-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42129"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] build info | build=0 commit="unknown" tid="140250340904960" timestamp=1743303260
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140250340904960" timestamp=1743303260 total_threads=16
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42129" tid="140250340904960" timestamp=1743303260
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.674-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:54:20 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:54:21 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:54:21 launchpad ollama[155852]: INFO [main] model loaded | tid="140250340904960" timestamp=1743303261
+Mar 29 19:54:21 launchpad ollama[1510]: time=2025-03-29T19:54:21.678-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:54:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:40 | 200 | 21.020815256s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.268-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.4 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.2 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44513"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] build info | build=0 commit="unknown" tid="140456813043712" timestamp=1743303505
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140456813043712" timestamp=1743303505 total_threads=16
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44513" tid="140456813043712" timestamp=1743303505
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.251-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:58:25 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:58:26 launchpad ollama[156905]: INFO [main] model loaded | tid="140456813043712" timestamp=1743303506
+Mar 29 19:58:26 launchpad ollama[1510]: time=2025-03-29T19:58:26.255-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:58:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:26 | 200 |  2.382167762s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:26 | 200 |  201.386254ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  871.494809ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  119.866405ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  204.574165ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  600.398574ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  120.301771ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  162.348729ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: time=2025-03-29T19:58:28.845-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.016-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44621"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] build info | build=0 commit="unknown" tid="139723562520576" timestamp=1743303509
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139723562520576" timestamp=1743303509 total_threads=16
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44621" tid="139723562520576" timestamp=1743303509
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:58:30 launchpad ollama[1510]: time=2025-03-29T19:58:30.018-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:58:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:58:30 launchpad ollama[156944]: INFO [main] model loaded | tid="139723562520576" timestamp=1743303510
+Mar 29 19:58:31 launchpad ollama[1510]: time=2025-03-29T19:58:31.022-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:58:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:43 | 200 | 15.143637267s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 20:01:15 launchpad ollama[156944]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2319 n_keep=4 n_left=2044 n_shift=1022 tid="139723562520576" timestamp=1743303675
+Mar 29 20:01:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 20:01:29 | 200 | 14.849592956s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:14:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:14:54 | 200 |     623.106µs |       127.0.0.1 | GET      "/api/tags"
+Mar 30 11:14:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:14:54 | 200 |      31.653µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 11:15:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:09 | 200 |      31.878µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8935112704 required="6.2 GiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37551"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] build info | build=0 commit="unknown" tid="140582430531584" timestamp=1743358522
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140582430531584" timestamp=1743358522 total_threads=16
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37551" tid="140582430531584" timestamp=1743358522
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.868-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:15:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:15:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:15:23 launchpad ollama[190865]: INFO [main] model loaded | tid="140582430531584" timestamp=1743358523
+Mar 30 11:15:23 launchpad ollama[1510]: time=2025-03-30T11:15:23.870-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:15:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:33 | 200 | 10.617418065s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:15:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:33 | 200 |  671.686201ms |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:15:35 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:35 | 200 |  2.190579097s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:18:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:18:12 | 200 | 10.393218154s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:21:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:21:51 | 200 |  9.948355464s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.607-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020768256 required="6.2 GiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.608-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.608-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38379"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.610-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] build info | build=0 commit="unknown" tid="139712057430016" timestamp=1743359326
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139712057430016" timestamp=1743359326 total_threads=16
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38379" tid="139712057430016" timestamp=1743359326
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.860-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:28:46 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:28:47 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:28:47 launchpad ollama[194362]: INFO [main] model loaded | tid="139712057430016" timestamp=1743359327
+Mar 30 11:28:47 launchpad ollama[1510]: time=2025-03-30T11:28:47.863-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:28:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:28:59 | 200 | 12.679412756s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35069"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.348-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] build info | build=0 commit="unknown" tid="140001028354048" timestamp=1743359736
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140001028354048" timestamp=1743359736 total_threads=16
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35069" tid="140001028354048" timestamp=1743359736
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.599-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:35:36 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:35:37 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:35:37 launchpad ollama[196139]: INFO [main] model loaded | tid="140001028354048" timestamp=1743359737
+Mar 30 11:35:37 launchpad ollama[1510]: time=2025-03-30T11:35:37.603-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 11:35:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:35:45 | 200 |  9.316356775s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.823-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33833"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] build info | build=0 commit="unknown" tid="140325241839616" timestamp=1743360226
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140325241839616" timestamp=1743360226 total_threads=16
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33833" tid="140325241839616" timestamp=1743360226
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:43:47 launchpad ollama[1510]: time=2025-03-30T11:43:47.074-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:43:47 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:43:47 launchpad ollama[198235]: INFO [main] model loaded | tid="140325241839616" timestamp=1743360227
+Mar 30 11:43:48 launchpad ollama[1510]: time=2025-03-30T11:43:48.079-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 11:43:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:43:50 | 200 |  3.468874029s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:44:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:44:45 | 200 |  6.057648133s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.954-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.954-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.955-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38283"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] build info | build=0 commit="unknown" tid="139775037440000" timestamp=1743360830
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139775037440000" timestamp=1743360830 total_threads=16
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38283" tid="139775037440000" timestamp=1743360830
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: time=2025-03-30T11:53:51.207-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:53:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:53:51 launchpad ollama[200808]: INFO [main] model loaded | tid="139775037440000" timestamp=1743360831
+Mar 30 11:53:52 launchpad ollama[1510]: time=2025-03-30T11:53:52.210-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:54:03 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:54:03 | 200 | 12.881324152s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:57:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:57:29 | 200 |  5.018045448s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:01:12 | 200 |  5.891354226s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:02:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:02:55 | 200 | 11.031168326s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:03:18 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:03:18 | 200 |  5.128749077s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:03:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:03:43 | 200 |  3.159849674s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8995602432 required="6.2 GiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="34.0 GiB" free_swap="68.9 GiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.022-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44363"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] build info | build=0 commit="unknown" tid="140634735337472" timestamp=1743364697
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140634735337472" timestamp=1743364697 total_threads=16
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44363" tid="140634735337472" timestamp=1743364697
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.274-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 12:58:17 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 12:58:18 launchpad ollama[217969]: INFO [main] model loaded | tid="140634735337472" timestamp=1743364698
+Mar 30 12:58:18 launchpad ollama[1510]: time=2025-03-30T12:58:18.278-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 12:58:22 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:58:22 | 200 |  5.918463234s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:58:26 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:58:26 | 200 |   3.34311065s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:32 | 200 |  904.989732ms |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:59:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:40 | 200 |  3.682284619s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:42 | 200 |  2.106195761s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:43 | 200 |   94.146459ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:43 | 200 |  174.747682ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |  1.020546761s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |   52.718223ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |  174.778406ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:45 | 200 |  990.164803ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:51 | 200 |  6.282808147s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:00:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:00:51 | 200 |  5.588454751s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:00:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:00:55 | 200 |  4.103858218s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:12 | 200 |  188.608016ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:12 | 200 |  185.173926ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:13 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:13 | 200 |   1.10262918s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:17 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:17 | 200 |  3.974637544s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:01:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:49 | 200 |   1.67097826s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:01:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:50 | 200 |  185.531573ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:50 | 200 |  187.898384ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:51 | 200 |  770.498947ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:00 | 200 |  9.195115212s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:02:28 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:28 | 200 |  1.776794277s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:02:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:29 | 200 |  176.978784ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:29 | 200 |    174.6454ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:30 | 200 |  1.062071109s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:38 | 200 |  8.280355153s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:04:58 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=4317 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365098
+Mar 30 13:05:02 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:02 | 200 |  4.556235272s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:05:03 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140634735337472" timestamp=1743365103
+Mar 30 13:05:03 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:03 | 200 |  1.092054906s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:04 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140634735337472" timestamp=1743365104
+Mar 30 13:05:05 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:05 | 200 |  1.089213965s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:06 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:06 | 200 |  1.158635314s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:06 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1302 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365106
+Mar 30 13:05:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:12 | 200 |  5.933884693s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:08:25 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1998 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365305
+Mar 30 13:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:31 | 200 |  6.277962524s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:31 | 200 |  199.192105ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:32 | 200 |  200.257129ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:33 | 200 |  1.118850329s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:44 | 200 | 11.280558324s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:10:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:58 | 200 |  1.957155103s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:10:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:58 | 200 |  174.897519ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:10:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:59 | 200 |  177.076028ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:10:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:59 | 200 |  815.533281ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:11:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:11:09 | 200 |   9.72070433s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020768256 required="6.2 GiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45455"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] build info | build=0 commit="unknown" tid="139857329553408" timestamp=1743365786
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139857329553408" timestamp=1743365786 total_threads=16
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45455" tid="139857329553408" timestamp=1743365786
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:16:27 launchpad ollama[1510]: time=2025-03-30T13:16:27.026-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:16:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:16:27 launchpad ollama[222729]: INFO [main] model loaded | tid="139857329553408" timestamp=1743365787
+Mar 30 13:16:28 launchpad ollama[1510]: time=2025-03-30T13:16:28.030-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 13:16:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:29 | 200 |  2.966369395s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:16:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:30 | 200 |  198.167659ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:30 | 200 |  199.286527ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:31 | 200 |  919.299764ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:40 | 200 |  8.881800137s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:18:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:50 | 200 |  200.994987ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:50 | 200 |  182.687511ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:51 | 200 |  1.212547742s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:58 | 200 |  6.346474242s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:21:36 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:36 | 200 |  213.977971ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:36 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:36 | 200 |  190.166012ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:37 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:37 | 200 |   1.13022637s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:49 | 200 | 12.000309524s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42283"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] build info | build=0 commit="unknown" tid="140490867326976" timestamp=1743366691
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140490867326976" timestamp=1743366691 total_threads=16
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42283" tid="140490867326976" timestamp=1743366691
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:31:31 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:31:32 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:31:32 launchpad ollama[226605]: INFO [main] model loaded | tid="140490867326976" timestamp=1743366692
+Mar 30 13:31:32 launchpad ollama[1510]: time=2025-03-30T13:31:32.478-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 13:31:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:32 | 200 |  1.618017075s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:32 | 200 |  218.929625ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:33 | 200 |  903.814406ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:45 | 200 |  11.67831139s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:35:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:30 | 200 |  215.015164ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:31 | 200 |  197.705893ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:32 | 200 |  1.150242635s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:43 | 200 | 11.022267554s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:37:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:55 | 200 |  194.491155ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:37:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:55 | 200 |  184.108833ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:37:56 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:56 | 200 |  1.094639872s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:38:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:38:08 | 200 | 12.487599999s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.007-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9019392000 required="6.2 GiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.008-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.008-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35511"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] build info | build=0 commit="unknown" tid="139861381177344" timestamp=1743367462
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139861381177344" timestamp=1743367462 total_threads=16
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35511" tid="139861381177344" timestamp=1743367462
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.260-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:44:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:44:23 launchpad ollama[229918]: INFO [main] model loaded | tid="139861381177344" timestamp=1743367463
+Mar 30 13:44:23 launchpad ollama[1510]: time=2025-03-30T13:44:23.265-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 13:44:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:23 | 200 |  1.631172672s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:23 | 200 |  148.745836ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:24 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:24 | 200 |  923.062233ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:33 | 200 |  8.958894716s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045803008 required="6.2 GiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.116-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43271"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] build info | build=0 commit="unknown" tid="139733190529024" timestamp=1743368036
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139733190529024" timestamp=1743368036 total_threads=16
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43271" tid="139733190529024" timestamp=1743368036
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.368-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:53:56 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:53:57 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:53:57 launchpad ollama[232446]: INFO [main] model loaded | tid="139733190529024" timestamp=1743368037
+Mar 30 13:53:57 launchpad ollama[1510]: time=2025-03-30T13:53:57.372-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 13:53:57 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:57 | 200 |   1.70512428s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:53:57 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:57 | 200 |  258.535774ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:53:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:58 | 200 |  818.066094ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:54:07 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:54:07 | 200 |  8.317710889s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.156-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.157-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.157-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43753"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] build info | build=0 commit="unknown" tid="140200415158272" timestamp=1743368910
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140200415158272" timestamp=1743368910 total_threads=16
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43753" tid="140200415158272" timestamp=1743368910
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.409-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:08:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:08:31 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:08:31 launchpad ollama[236233]: INFO [main] model loaded | tid="140200415158272" timestamp=1743368911
+Mar 30 14:08:31 launchpad ollama[1510]: time=2025-03-30T14:08:31.414-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:31 | 200 |  1.752032178s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:32 | 200 |  307.782722ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:33 | 200 |  1.008237057s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:41 | 200 |  8.861937194s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9002483712 required="6.2 GiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.866-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41317"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] build info | build=0 commit="unknown" tid="140401679781888" timestamp=1743369339
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140401679781888" timestamp=1743369339 total_threads=16
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41317" tid="140401679781888" timestamp=1743369339
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:15:40 launchpad ollama[1510]: time=2025-03-30T14:15:40.118-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:15:40 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:15:40 launchpad ollama[238117]: INFO [main] model loaded | tid="140401679781888" timestamp=1743369340
+Mar 30 14:15:41 launchpad ollama[1510]: time=2025-03-30T14:15:41.123-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:15:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:41 | 200 |  1.673962631s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:41 | 200 |  214.491787ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:42 | 200 |  924.597862ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:50 | 200 |  8.197635357s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:20:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:42 | 200 |  242.350589ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:42 | 200 |  219.744923ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:43 | 200 |  1.109669094s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:53 | 200 |  9.705641534s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.356-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.356-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.357-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43383"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] build info | build=0 commit="unknown" tid="140680711970816" timestamp=1743370282
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140680711970816" timestamp=1743370282 total_threads=16
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43383" tid="140680711970816" timestamp=1743370282
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.609-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:31:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:31:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:31:23 launchpad ollama[242210]: INFO [main] model loaded | tid="140680711970816" timestamp=1743370283
+Mar 30 14:31:23 launchpad ollama[1510]: time=2025-03-30T14:31:23.612-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 14:31:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:23 | 200 |  1.721336635s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:24 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:24 | 200 |  227.394231ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:25 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:25 | 200 |   1.08918971s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:35 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:35 | 200 | 10.015729494s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8996454400 required="6.2 GiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.369-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38611"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] build info | build=0 commit="unknown" tid="140516085518336" timestamp=1743371255
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140516085518336" timestamp=1743371255 total_threads=16
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38611" tid="140516085518336" timestamp=1743371255
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.621-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:47:35 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:47:36 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:47:36 launchpad ollama[246746]: INFO [main] model loaded | tid="140516085518336" timestamp=1743371256
+Mar 30 14:47:36 launchpad ollama[1510]: time=2025-03-30T14:47:36.625-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:47:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:38 | 200 |  3.187421732s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:47:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:39 | 200 |   367.25905ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:39 | 200 |  364.747795ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:40 | 200 |  1.061008605s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:46 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:46 | 200 |  6.031404853s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9006940160 required="6.2 GiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33717"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] build info | build=0 commit="unknown" tid="140162681102336" timestamp=1743371702
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140162681102336" timestamp=1743371702 total_threads=16
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33717" tid="140162681102336" timestamp=1743371702
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.419-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:55:02 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:55:03 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:55:03 launchpad ollama[248655]: INFO [main] model loaded | tid="140162681102336" timestamp=1743371703
+Mar 30 14:55:03 launchpad ollama[1510]: time=2025-03-30T14:55:03.424-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:55:05 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:05 | 200 |  3.476752927s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:55:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:08 | 200 |  206.680476ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:08 | 200 |  205.091541ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:09 | 200 |  956.182999ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:16 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:16 | 200 |  7.199068594s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37485"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] build info | build=0 commit="unknown" tid="139670217154560" timestamp=1743372292
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139670217154560" timestamp=1743372292 total_threads=16
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37485" tid="139670217154560" timestamp=1743372292
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.451-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:04:52 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:04:53 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:04:53 launchpad ollama[251181]: INFO [main] model loaded | tid="139670217154560" timestamp=1743372293
+Mar 30 15:04:53 launchpad ollama[1510]: time=2025-03-30T15:04:53.454-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 15:04:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:53 | 200 |  1.714657288s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:04:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:53 | 200 |  266.660072ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:04:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:54 | 200 |  909.679115ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:05:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:05:00 | 200 |  5.889296542s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9002745856 required="6.2 GiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.465-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42603"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] build info | build=0 commit="unknown" tid="140247288848384" timestamp=1743373131
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140247288848384" timestamp=1743373131 total_threads=16
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42603" tid="140247288848384" timestamp=1743373131
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.717-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:18:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:18:52 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:18:52 launchpad ollama[254756]: INFO [main] model loaded | tid="140247288848384" timestamp=1743373132
+Mar 30 15:18:52 launchpad ollama[1510]: time=2025-03-30T15:18:52.721-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:18:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:53 | 200 |  1.729779035s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:18:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:53 | 200 |  285.014496ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:18:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:54 | 200 |  953.831686ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:19:02 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:19:02 | 200 |  7.843429909s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:21:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:40 | 200 |  203.782143ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:41 | 200 |  190.182538ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:42 | 200 |  1.162467695s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:48 | 200 |    6.0991206s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.004-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39767"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] build info | build=0 commit="unknown" tid="139839142899712" timestamp=1743373974
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139839142899712" timestamp=1743373974 total_threads=16
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39767" tid="139839142899712" timestamp=1743373974
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.255-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:32:54 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:32:55 launchpad ollama[258356]: INFO [main] model loaded | tid="139839142899712" timestamp=1743373975
+Mar 30 15:32:55 launchpad ollama[1510]: time=2025-03-30T15:32:55.259-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 15:32:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:55 | 200 |  1.716980171s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:32:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:55 | 200 |  257.183863ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:32:56 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:56 | 200 |  1.021002209s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:33:04 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:33:04 | 200 |  7.321296408s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020899328 required="6.2 GiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.296-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42135"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] build info | build=0 commit="unknown" tid="140419643985920" timestamp=1743374431
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140419643985920" timestamp=1743374431 total_threads=16
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42135" tid="140419643985920" timestamp=1743374431
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.548-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:40:31 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:40:32 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:40:32 launchpad ollama[260348]: INFO [main] model loaded | tid="140419643985920" timestamp=1743374432
+Mar 30 15:40:32 launchpad ollama[1510]: time=2025-03-30T15:40:32.552-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:32 | 200 |  1.677300117s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:32 | 200 |  219.154515ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:33 | 200 |  998.378093ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:41 | 200 |  7.661978973s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.105-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42937"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] build info | build=0 commit="unknown" tid="140163381395456" timestamp=1743375047
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140163381395456" timestamp=1743375047 total_threads=16
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42937" tid="140163381395456" timestamp=1743375047
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.357-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:50:47 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:50:48 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:50:48 launchpad ollama[263048]: INFO [main] model loaded | tid="140163381395456" timestamp=1743375048
+Mar 30 15:50:48 launchpad ollama[1510]: time=2025-03-30T15:50:48.361-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:50:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:48 | 200 |  1.713678009s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:50:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:48 | 200 |  269.004212ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:50:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:49 | 200 |  1.020975522s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:51:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:51:00 | 200 | 10.493888459s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:54:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:54:54 | 200 |    1.579821ms |       127.0.0.1 | GET      "/api/tags"
+Mar 30 15:54:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:54:54 | 200 |      35.176µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9026863104 required="6.2 GiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43533"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] build info | build=0 commit="unknown" tid="140020290863104" timestamp=1743375757
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140020290863104" timestamp=1743375757 total_threads=16
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43533" tid="140020290863104" timestamp=1743375757
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.591-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 16:02:37 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 16:02:38 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 16:02:38 launchpad ollama[266350]: INFO [main] model loaded | tid="140020290863104" timestamp=1743375758
+Mar 30 16:02:38 launchpad ollama[1510]: time=2025-03-30T16:02:38.593-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 16:02:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:38 | 200 |  1.702624975s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:39 | 200 |  258.790518ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:40 | 200 |  1.003156361s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:51 | 200 |  11.65021323s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9001697280 required="6.2 GiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46001"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.802-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] build info | build=0 commit="unknown" tid="140115138469888" timestamp=1743376316
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140115138469888" timestamp=1743376316 total_threads=16
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46001" tid="140115138469888" timestamp=1743376316
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 16:11:57 launchpad ollama[1510]: time=2025-03-30T16:11:57.053-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 16:11:57 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 16:11:57 launchpad ollama[268772]: INFO [main] model loaded | tid="140115138469888" timestamp=1743376317
+Mar 30 16:11:58 launchpad ollama[1510]: time=2025-03-30T16:11:58.056-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 16:11:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:58 | 200 |  1.667946555s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:11:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:58 | 200 |   218.09716ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:11:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:59 | 200 |  992.509671ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:12:07 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:12:07 | 200 |  8.252179407s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 16:17:07 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-30T16:17:07.940-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 30 17:37:01 launchpad ollama[1510]: [GIN] 2025/03/30 - 17:37:01 | 200 |     576.005µs |       127.0.0.1 | GET      "/api/tags"
+Mar 30 17:37:01 launchpad ollama[1510]: [GIN] 2025/03/30 - 17:37:01 | 200 |      27.907µs |       127.0.0.1 | GET      "/api/version"
+Apr 02 12:13:56 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:13:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:13:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:13:56 launchpad systemd[1]: ollama.service: Consumed 31min 47.435s CPU time, 6G memory peak, 4K memory swap peak, 4.3G read from disk, 5G written to disk.
+-- Boot a52400493ab945b0aa668c54f0abba5c --
+Apr 02 12:14:30 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:14:30 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:14:31 launchpad ollama[1515]: 2025/04/02 12:14:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.078-07:00 level=INFO source=images.go:753 msg="total blobs: 34"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.089-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 1"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.091-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.093-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2075395292/runners
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.067-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.068-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.068-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.069-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.069-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.303-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 12:18:17 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:18:17 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:18:17 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:18:17 launchpad systemd[1]: ollama.service: Consumed 3.402s CPU time, 787.1M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot c63e1f966157468781dc04a3cb210db4 --
+Apr 02 12:18:54 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:18:54 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:18:54 launchpad ollama[1515]: 2025/04/02 12:18:54 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.676-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.687-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.689-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.690-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1727149981/runners
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.682-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.683-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:19:05 launchpad ollama[1515]: time=2025-04-02T12:19:05.616-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 02 12:20:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:20:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:20:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:20:45 launchpad systemd[1]: ollama.service: Consumed 11.356s CPU time, 787.2M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 260ec0b4bc2a4473a8dfa956ac0f6f03 --
+Apr 02 12:21:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:21:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:21:20 launchpad ollama[1509]: 2025/04/02 12:21:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.003-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.015-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.017-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.018-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3567153864/runners
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.001-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.003-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.003-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.004-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.004-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.234-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 12:37:00 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:37:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:37:00 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:37:00 launchpad systemd[1]: ollama.service: Consumed 3.421s CPU time, 787.5M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 173713d559934faf95ac519561197f5f --
+Apr 02 12:37:32 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:37:32 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:37:32 launchpad ollama[1512]: 2025/04/02 12:37:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.497-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.507-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.508-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.511-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1619976437/runners
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.554-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.555-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.555-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.556-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.556-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.770-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 13:05:03 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 13:05:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 13:05:03 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 13:05:03 launchpad systemd[1]: ollama.service: Consumed 3.489s CPU time, 787.1M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot e700071ab6664809b3f524d7cf5fb9ac --
+Apr 02 13:05:37 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 13:05:37 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 13:05:37 launchpad ollama[1556]: 2025/04/02 13:05:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.349-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.359-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.362-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.364-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2881439680/runners
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.399-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.631-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 13:27:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 13:27:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 13:27:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 13:27:38 launchpad systemd[1]: ollama.service: Consumed 3.487s CPU time, 788.2M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot f51e83401288435294bac45db37486d6 --
+Apr 02 13:28:13 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 13:28:13 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 13:28:14 launchpad ollama[1547]: 2025/04/02 13:28:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.091-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.099-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.100-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.102-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3883982625/runners
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.071-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.072-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.072-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.305-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 14:00:56 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 14:00:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 14:00:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 14:00:56 launchpad systemd[1]: ollama.service: Consumed 3.426s CPU time, 787.5M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot cf8eb89e6cc84de0ae5317ce0068970d --
+Apr 02 14:01:27 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 14:01:27 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 14:01:27 launchpad ollama[1545]: 2025/04/02 14:01:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.831-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.844-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.846-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.849-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3147859582/runners
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.822-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.824-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.824-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.825-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.825-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:31 launchpad ollama[1545]: time=2025-04-02T14:01:31.062-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 03 10:57:05 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:05 | 200 |     3.99064ms |       127.0.0.1 | GET      "/api/tags"
+Apr 03 10:57:06 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:06 | 200 |     620.572µs |       127.0.0.1 | GET      "/api/tags"
+Apr 03 10:57:06 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:06 | 200 |     577.383µs |       127.0.0.1 | GET      "/api/version"
+-- Boot 85ddb90a318d4b65b30a92b1d80fef3e --
+Apr 08 10:18:56 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 10:18:56 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 10:18:56 launchpad ollama[1579]: 2025/04/08 10:18:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.380-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.396-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.399-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.401-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1454344761/runners
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.330-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.331-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.331-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.332-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.332-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.573-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 08 17:20:02 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:20:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:20:02 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:20:02 launchpad systemd[1]: ollama.service: Consumed 4.139s CPU time, 786.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+Apr 08 17:20:07 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 17:20:07 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 17:20:07 launchpad ollama[474753]: 2025/04/08 17:20:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.723-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.727-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.729-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.730-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2879337735/runners
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.910-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:11 launchpad ollama[474753]: time=2025-04-08T17:20:11.142-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.9 GiB"
+Apr 08 17:22:12 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:22:13 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:22:13 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:22:13 launchpad systemd[1]: ollama.service: Consumed 3.653s CPU time, 775.7M memory peak, 223.1M read from disk, 508.1M written to disk.
+-- Boot 553c6e5aa616434a99d28581f60a0555 --
+Apr 08 17:22:46 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 17:22:47 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 17:22:47 launchpad ollama[1543]: 2025/04/08 17:22:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.583-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.593-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.594-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.596-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3672531136/runners
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.636-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.855-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 08 17:25:11 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:25:11 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:25:11 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:25:11 launchpad systemd[1]: ollama.service: Consumed 3.477s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 8d49b51a62474ba0916c4bd79a5b6d43 --
+Apr 09 09:13:24 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:13:24 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:13:24 launchpad ollama[1593]: 2025/04/09 09:13:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.554-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.566-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.568-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.570-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama692572085/runners
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.685-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.685-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:25:11 launchpad systemd[1]: Stopping Server for local large language models...
+-- Boot c0edc8fdb2134a4e89eaaf392e7ef33e --
+Apr 09 09:27:15 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:27:15 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:27:15 launchpad ollama[1548]: 2025/04/09 09:27:15 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.346-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.355-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.355-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.357-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3891515531/runners
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.381-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.382-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.382-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.383-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.383-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.602-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 09:40:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 09:40:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 09:40:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 09:40:38 launchpad systemd[1]: ollama.service: Consumed 3.467s CPU time, 787.5M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 69a7e7ab69354669831fa63c8712a136 --
+Apr 09 09:41:09 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:41:10 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:41:10 launchpad ollama[1546]: 2025/04/09 09:41:10 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.156-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.166-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.167-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.169-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2258545132/runners
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.201-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.201-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.415-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 09:41:24 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 09:41:24 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 09:41:24 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 09:41:24 launchpad systemd[1]: ollama.service: Consumed 3.471s CPU time, 786.9M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 4610dee92c8648ad8a9fc7a26d1a4fde --
+Apr 09 11:10:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 11:10:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 11:10:55 launchpad ollama[1628]: 2025/04/09 11:10:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 11:10:55 launchpad ollama[1628]: time=2025-04-09T11:10:55.994-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.002-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.003-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.005-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1412616515/runners
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.022-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.245-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |      3.9112ms |       127.0.0.1 | GET      "/api/tags"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |     615.805µs |       127.0.0.1 | GET      "/api/tags"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |     567.133µs |       127.0.0.1 | GET      "/api/version"
+Apr 09 10:33:12 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:12 | 200 |      26.861µs |       127.0.0.1 | GET      "/api/version"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.550-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.697-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10101260288 required="9.2 GiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.697-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.698-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1412616515/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 39735"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.700-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] build info | build=0 commit="unknown" tid="140624111030272" timestamp=1744220061
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624111030272" timestamp=1744220061 total_threads=16
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39735" tid="140624111030272" timestamp=1744220061
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type  f32:   81 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type q4_0:  281 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type q6_K:    1 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_vocab: special tokens cache size = 3
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: format           = GGUF V2
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: arch             = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: vocab type       = SPM
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_vocab          = 32016
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_merges         = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: vocab_only       = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd           = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_layer          = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_head           = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_head_kv        = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_rot            = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_swa            = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_gqa            = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ff             = 13824
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_expert         = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_expert_used    = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: causal attn      = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: pooling type     = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope type        = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope scaling     = linear
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: freq_scale_train = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_state      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model type       = 13B
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model ftype      = Q4_0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model params     = 13.02 B
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: general.name     = codellama
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: BOS token        = 1 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: EOS token        = 2 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: UNK token        = 0 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: max token length = 48
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: found 1 CUDA devices:
+Apr 09 10:34:21 launchpad ollama[1628]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.951-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_ctx      = 2048
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_batch    = 512
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_ubatch   = 512
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: flash_attn = 0
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: freq_scale = 1
+Apr 09 10:34:30 launchpad ollama[1628]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: graph nodes  = 1286
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: graph splits = 2
+Apr 09 10:34:30 launchpad ollama[8831]: INFO [main] model loaded | tid="140624111030272" timestamp=1744220070
+Apr 09 10:34:30 launchpad ollama[1628]: time=2025-04-09T10:34:30.727-07:00 level=INFO source=server.go:626 msg="llama runner started in 9.03 seconds"
+Apr 09 10:34:41 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:41 | 200 | 19.792110296s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:34:41 launchpad ollama[1628]: time=2025-04-09T10:34:41.398-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:42 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:42 | 200 |  1.144799868s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:34:42 launchpad ollama[1628]: time=2025-04-09T10:34:42.577-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:48 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:48 | 200 |  6.182628008s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:38:02 launchpad ollama[1628]: time=2025-04-09T10:38:02.471-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:38:17 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:38:17 | 200 | 15.421078589s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 12:10:22 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:10:22 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:10:22 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:10:22 launchpad systemd[1]: ollama.service: Consumed 39.260s CPU time, 7.9G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot 679bf54354244e11892dcfefa494c9e4 --
+Apr 09 12:11:08 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:11:08 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:11:08 launchpad ollama[1544]: 2025/04/09 12:11:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.176-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.184-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.185-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.187-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2862335710/runners
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.230-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.230-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.443-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 12:16:16 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:16:16 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:16:16 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:16:16 launchpad systemd[1]: ollama.service: Consumed 3.478s CPU time, 787.1M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 7c4ea448b8b34058ab6d95569f0a0445 --
+Apr 09 12:16:53 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:16:53 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:16:53 launchpad ollama[1553]: 2025/04/09 12:16:53 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.865-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.875-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.876-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.878-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3780608611/runners
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.861-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.862-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.862-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.863-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.863-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:58 launchpad ollama[1553]: time=2025-04-09T12:16:58.178-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 09 12:17:15 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:17:15 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:17:15 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:17:15 launchpad systemd[1]: ollama.service: Consumed 5.345s CPU time, 787.9M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot 7042a4bf34f746e0ae0b9b7c8b0372a8 --
+Apr 09 12:17:47 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:17:47 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:17:47 launchpad ollama[1552]: 2025/04/09 12:17:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.381-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.393-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.394-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.396-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1680189345/runners
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.385-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.386-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.386-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.387-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.387-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.616-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 12:37:35 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:37:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:37:35 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:37:35 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 786.9M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c987ebd8bf4144d78c265f5bb5b9f026 --
+Apr 09 13:48:22 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:48:22 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:48:23 launchpad ollama[1555]: 2025/04/09 13:48:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.041-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.052-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.053-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.054-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama613101396/runners
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.045-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:27 launchpad ollama[1555]: time=2025-04-09T13:48:27.869-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 09 13:48:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 13:48:44 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 13:48:44 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 13:48:44 launchpad systemd[1]: ollama.service: Consumed 5.361s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 88dbd4a34cf44613955445ccd8d61888 --
+Apr 09 13:49:15 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:49:16 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:49:16 launchpad ollama[1545]: 2025/04/09 13:49:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.148-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.157-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.158-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.159-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2274272387/runners
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.193-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.194-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.194-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.195-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.195-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.416-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 13:59:14 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 13:59:14 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 13:59:14 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 13:59:14 launchpad systemd[1]: ollama.service: Consumed 3.469s CPU time, 786.9M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot ced04c8de93a40778c928ba8b9f5c2f4 --
+Apr 09 13:59:46 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:59:46 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:59:46 launchpad ollama[1547]: 2025/04/09 13:59:46 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.233-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.244-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.245-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.246-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama800634150/runners
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.230-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.230-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.450-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 14:15:11 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 14:15:11 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 14:15:11 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 14:15:11 launchpad systemd[1]: ollama.service: Consumed 3.424s CPU time, 786.6M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot fb67aec4a96c42b18e070001ac894f87 --
+Apr 09 14:15:42 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 14:15:42 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 14:15:42 launchpad ollama[1542]: 2025/04/09 14:15:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.982-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.990-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.991-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.992-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2499721513/runners
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.974-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:46 launchpad ollama[1542]: time=2025-04-09T14:15:46.213-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 10:06:49 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 10:06:49 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 10:06:49 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 10:06:49 launchpad systemd[1]: ollama.service: Consumed 4.209s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot e9957a4137514212bb6d6f281b5a62a2 --
+Apr 10 10:08:51 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 10:08:51 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 10:08:51 launchpad ollama[1577]: 2025/04/10 10:08:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.571-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.581-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.582-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.584-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3501558433/runners
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.506-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.507-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:56 launchpad ollama[1577]: time=2025-04-10T10:08:56.220-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 10 10:22:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 10:22:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 10:22:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 10:22:45 launchpad systemd[1]: ollama.service: Consumed 5.248s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 4618c9f76f3e4a6ab487c026738f643a --
+Apr 10 10:23:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 10:23:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 10:23:21 launchpad ollama[1475]: 2025/04/10 10:23:21 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.359-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.476-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.484-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.496-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1075261275/runners
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.634-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.643-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.648-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.648-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:29 launchpad ollama[1475]: time=2025-04-10T10:23:29.118-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 10 14:47:23 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 14:47:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 14:47:23 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 14:47:23 launchpad systemd[1]: ollama.service: Consumed 3.581s CPU time, 788M memory peak, 235.2M read from disk, 508.1M written to disk.
+-- Boot bf8f28edaff24d31aab33845041368de --
+Apr 10 14:48:03 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 14:48:03 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 14:48:03 launchpad ollama[1765]: 2025/04/10 14:48:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.302-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.311-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.312-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.314-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2361362427/runners
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.393-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.393-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.624-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 15:04:54 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 15:04:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 15:04:54 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 15:04:54 launchpad systemd[1]: ollama.service: Consumed 3.479s CPU time, 787.3M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 0cfbb8f023a7402faa52ab71aa7747c6 --
+Apr 10 15:05:26 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 15:05:26 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 15:05:26 launchpad ollama[1767]: 2025/04/10 15:05:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.441-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.452-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.453-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.455-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama428823615/runners
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.442-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.669-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 15:24:23 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 15:24:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 15:24:23 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 15:24:23 launchpad systemd[1]: ollama.service: Consumed 3.455s CPU time, 787.1M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 7ccc3d496da5419f8ae6577671f300b0 --
+Apr 10 15:24:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 15:24:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 15:24:56 launchpad ollama[1755]: 2025/04/10 15:24:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.083-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.095-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.096-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.097-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2840929771/runners
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.152-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.365-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 16:02:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 16:02:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 16:02:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 16:02:19 launchpad systemd[1]: ollama.service: Consumed 3.495s CPU time, 787.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 7d0a614ca76f4931b080917f004526fd --
+Apr 10 16:03:00 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 16:03:00 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 16:03:00 launchpad ollama[1757]: 2025/04/10 16:03:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.874-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.883-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.884-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.886-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4085095757/runners
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.865-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.866-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.866-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.867-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.867-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:04 launchpad ollama[1757]: time=2025-04-10T16:03:04.089-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 11:40:29 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 11:40:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 11:40:29 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 11:40:29 launchpad systemd[1]: ollama.service: Consumed 3.777s CPU time, 787.5M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot f5f3826f76534b1db33c34162bde23a7 --
+Apr 12 11:41:10 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 11:41:10 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 11:41:11 launchpad ollama[1751]: 2025/04/12 11:41:11 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.064-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.071-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.072-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.073-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama702236440/runners
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.044-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.045-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.045-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.283-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 15:50:45 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:45 | 200 |    3.751256ms |       127.0.0.1 | GET      "/api/tags"
+Apr 12 15:50:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:46 | 200 |     768.857µs |       127.0.0.1 | GET      "/api/tags"
+Apr 12 15:50:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:46 | 200 |     586.742µs |       127.0.0.1 | GET      "/api/version"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.377-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9205645312 required="6.2 GiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.377-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.378-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43409"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.381-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] build info | build=0 commit="unknown" tid="139813475696640" timestamp=1744498319
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139813475696640" timestamp=1744498319 total_threads=16
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43409" tid="139813475696640" timestamp=1744498319
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.631-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:51:59 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:52:05 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:52:05 launchpad ollama[89512]: INFO [main] model loaded | tid="139813475696640" timestamp=1744498325
+Apr 12 15:52:05 launchpad ollama[1751]: time=2025-04-12T15:52:05.647-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 12 15:52:05 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:05 | 200 |  6.627753307s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:52:05 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:05 | 200 |  168.769922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.187-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9205514240 required="6.5 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37237"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] build info | build=0 commit="unknown" tid="140648581529600" timestamp=1744498326
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140648581529600" timestamp=1744498326 total_threads=16
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37237" tid="140648581529600" timestamp=1744498326
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:52:07 launchpad ollama[1751]: time=2025-04-12T15:52:07.124-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:52:07 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:52:12 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:52:12 launchpad ollama[89549]: INFO [main] model loaded | tid="140648581529600" timestamp=1744498332
+Apr 12 15:52:12 launchpad ollama[1751]: time=2025-04-12T15:52:12.388-07:00 level=INFO source=server.go:626 msg="llama runner started in 5.51 seconds"
+Apr 12 15:52:31 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:31 | 200 | 25.176674753s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:52:33 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:33 | 200 |  1.957238926s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:52:38 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:38 | 200 |  5.699314963s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:56:43 launchpad ollama[1751]: time=2025-04-12T15:56:43.701-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.395-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218686976 required="6.2 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.395-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.396-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33515"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] build info | build=0 commit="unknown" tid="139842239279104" timestamp=1744498604
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139842239279104" timestamp=1744498604 total_threads=16
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33515" tid="139842239279104" timestamp=1744498604
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.649-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:56:44 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:56:45 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:56:45 launchpad ollama[90712]: INFO [main] model loaded | tid="139842239279104" timestamp=1744498605
+Apr 12 15:56:45 launchpad ollama[1751]: time=2025-04-12T15:56:45.652-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 12 15:56:45 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:56:45 | 200 |  2.323122961s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:56:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:56:46 | 200 |  190.725431ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.233-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218228224 required="6.5 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43113"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] build info | build=0 commit="unknown" tid="140338040655872" timestamp=1744498606
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140338040655872" timestamp=1744498606 total_threads=16
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43113" tid="140338040655872" timestamp=1744498606
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: time=2025-04-12T15:56:47.209-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:56:47 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:56:48 launchpad ollama[90780]: INFO [main] model loaded | tid="140338040655872" timestamp=1744498608
+Apr 12 15:56:48 launchpad ollama[1751]: time=2025-04-12T15:56:48.213-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:06 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:06 | 200 | 20.745649127s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:57:26 launchpad ollama[1751]: time=2025-04-12T15:57:26.810-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.2 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34397"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] build info | build=0 commit="unknown" tid="140132836282368" timestamp=1744498647
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140132836282368" timestamp=1744498647 total_threads=16
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34397" tid="140132836282368" timestamp=1744498647
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.758-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:57:27 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:57:28 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:57:28 launchpad ollama[91152]: INFO [main] model loaded | tid="140132836282368" timestamp=1744498648
+Apr 12 15:57:28 launchpad ollama[1751]: time=2025-04-12T15:57:28.761-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:28 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:28 | 200 |   2.30868954s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:57:29 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:29 | 200 |  173.378365ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.307-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.5 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33589"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] build info | build=0 commit="unknown" tid="139925061128192" timestamp=1744498650
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139925061128192" timestamp=1744498650 total_threads=16
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33589" tid="139925061128192" timestamp=1744498650
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: time=2025-04-12T15:57:30.242-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:57:30 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:57:31 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:57:31 launchpad ollama[91183]: INFO [main] model loaded | tid="139925061128192" timestamp=1744498651
+Apr 12 15:57:31 launchpad ollama[1751]: time=2025-04-12T15:57:31.245-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:44 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:44 | 200 | 14.992532084s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:01:00 launchpad ollama[1751]: time=2025-04-12T16:01:00.343-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9211412480 required="6.2 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34717"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] build info | build=0 commit="unknown" tid="140591286628352" timestamp=1744498861
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140591286628352" timestamp=1744498861 total_threads=16
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34717" tid="140591286628352" timestamp=1744498861
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.286-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:01:01 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:01:02 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:01:02 launchpad ollama[92523]: INFO [main] model loaded | tid="140591286628352" timestamp=1744498862
+Apr 12 16:01:02 launchpad ollama[1751]: time=2025-04-12T16:01:02.290-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:01:02 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:02 | 200 |  2.341767936s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:01:02 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:02 | 200 |  210.156647ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:01:02 launchpad ollama[1751]: time=2025-04-12T16:01:02.938-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.632-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212329984 required="6.5 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.632-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.633-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37551"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] build info | build=0 commit="unknown" tid="140050573516800" timestamp=1744498863
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140050573516800" timestamp=1744498863 total_threads=16
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37551" tid="140050573516800" timestamp=1744498863
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.885-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:01:03 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:01:04 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:01:04 launchpad ollama[92554]: INFO [main] model loaded | tid="140050573516800" timestamp=1744498864
+Apr 12 16:01:04 launchpad ollama[1751]: time=2025-04-12T16:01:04.888-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:01:30 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:30 | 200 | 27.372628557s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:03:01 launchpad ollama[1751]: time=2025-04-12T16:03:01.429-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212461056 required="6.2 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36733"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] build info | build=0 commit="unknown" tid="140123403542528" timestamp=1744498982
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140123403542528" timestamp=1744498982 total_threads=16
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36733" tid="140123403542528" timestamp=1744498982
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.394-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:03:02 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:03:03 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:03:03 launchpad ollama[93031]: INFO [main] model loaded | tid="140123403542528" timestamp=1744498983
+Apr 12 16:03:03 launchpad ollama[1751]: time=2025-04-12T16:03:03.397-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:03:03 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:03 | 200 |  2.323360326s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:03:03 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:03 | 200 |  168.583748ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:03:03 launchpad ollama[1751]: time=2025-04-12T16:03:03.941-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212526592 required="6.5 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.630-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43341"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.630-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.631-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.631-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] build info | build=0 commit="unknown" tid="139945172643840" timestamp=1744498984
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139945172643840" timestamp=1744498984 total_threads=16
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43341" tid="139945172643840" timestamp=1744498984
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.882-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:03:04 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:03:05 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:03:05 launchpad ollama[93062]: INFO [main] model loaded | tid="139945172643840" timestamp=1744498985
+Apr 12 16:03:05 launchpad ollama[1751]: time=2025-04-12T16:03:05.886-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 12 16:03:27 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:27 | 200 | 23.565740742s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:04:14 launchpad ollama[1751]: time=2025-04-12T16:04:14.949-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.646-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212788736 required="6.2 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.646-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.647-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39819"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.649-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] build info | build=0 commit="unknown" tid="140146931527680" timestamp=1744499055
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140146931527680" timestamp=1744499055 total_threads=16
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39819" tid="140146931527680" timestamp=1744499055
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.900-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:04:15 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:04:16 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:04:16 launchpad ollama[93352]: INFO [main] model loaded | tid="140146931527680" timestamp=1744499056
+Apr 12 16:04:16 launchpad ollama[1751]: time=2025-04-12T16:04:16.903-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:04:17 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:17 | 200 |  2.305077355s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:04:17 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:17 | 200 |  169.492759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:04:17 launchpad ollama[1751]: time=2025-04-12T16:04:17.444-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212592128 required="6.5 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42925"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] build info | build=0 commit="unknown" tid="140604163182592" timestamp=1744499058
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604163182592" timestamp=1744499058 total_threads=16
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42925" tid="140604163182592" timestamp=1744499058
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:04:18 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:04:19 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:04:19 launchpad ollama[93382]: INFO [main] model loaded | tid="140604163182592" timestamp=1744499059
+Apr 12 16:04:19 launchpad ollama[1751]: time=2025-04-12T16:04:19.401-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:04:32 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:32 | 200 | 15.566448776s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 18:34:49 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 18:34:50 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 18:34:50 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 18:34:50 launchpad systemd[1]: ollama.service: Consumed 2min 25.493s CPU time, 10.2G memory peak, 9.2G read from disk, 508.1M written to disk.
+-- Boot b745703e6a8e420d9ac385fadf7e88b8 --
+Apr 12 18:35:30 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 18:35:30 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 18:35:31 launchpad ollama[1752]: 2025/04/12 18:35:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.028-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.036-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.038-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.039-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4229267965/runners
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.013-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.248-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 19:07:02 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 19:07:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 19:07:02 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 19:07:02 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 786.5M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 2596e88fae2040f4923249aafb7e7712 --
+Apr 12 19:07:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 19:07:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 19:07:44 launchpad ollama[1757]: 2025/04/12 19:07:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.139-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.149-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.150-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.152-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama260338899/runners
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.187-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.187-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.402-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |    3.878869ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |     634.032µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |     589.933µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 11:52:35 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 11:52:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 11:52:35 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 11:52:35 launchpad systemd[1]: ollama.service: Consumed 3.511s CPU time, 787.4M memory peak, 234.7M read from disk, 508.1M written to disk.
+-- Boot 40317137f26d4efc9c418a400690a2a8 --
+Apr 13 11:53:16 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 11:53:16 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 11:53:16 launchpad ollama[1754]: 2025/04/13 11:53:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.436-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.445-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.446-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.447-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1506917716/runners
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.475-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.697-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |    3.728711ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |     620.216µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |      581.55µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 12:18:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:18:38 | 200 |     576.644µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:18:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:18:38 | 200 |       26.46µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.978-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10194780160 required="6.2 GiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.978-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.979-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39129"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.981-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] build info | build=0 commit="unknown" tid="139944833957888" timestamp=1744572125
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944833957888" timestamp=1744572125 total_threads=16
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39129" tid="139944833957888" timestamp=1744572125
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: time=2025-04-13T12:22:05.231-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 12:22:05 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 12:22:10 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 12:22:11 launchpad ollama[11080]: INFO [main] model loaded | tid="139944833957888" timestamp=1744572131
+Apr 13 12:22:11 launchpad ollama[1754]: time=2025-04-13T12:22:11.250-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 13 12:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:11 | 200 |  6.620345601s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:11 | 200 |  172.120741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:12 | 200 |   166.67769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:12 | 200 |  168.137847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:14 | 200 |  2.579237733s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:17 | 200 |  2.499719065s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:18 | 200 |  735.220817ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:19 | 200 |  684.265221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:23 | 200 |  4.246424967s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:27 | 200 |    4.3554732s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:29 | 200 |  1.290874466s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:30 | 200 |  1.287759135s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:31 | 200 |  708.163748ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |  658.403673ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |   94.787215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |   174.68155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:38 | 200 |  5.692742201s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:43 | 200 |  5.635757698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:49 | 200 |  5.697265074s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:56 | 200 |  7.068241853s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:00 | 200 |  3.788666964s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:06 | 200 |  5.622250292s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:12 | 200 |  5.676904881s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:18 | 200 |  5.740432975s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:25 | 200 |  7.038507573s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:28 | 200 |  3.788416995s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  135.694108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  133.483345ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  125.359869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  128.281804ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:30 | 200 |  133.415387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:37 | 200 |  7.421019595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:44 | 200 |  7.090316178s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:52 | 200 |  7.408047169s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:59 | 200 |  7.137196195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:59 | 200 |   87.397788ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:24:00 | 200 |   229.93963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:24:00 | 200 |  227.346569ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:04 | 200 |  1.002033433s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:05 | 200 |  899.358758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:06 | 200 |  808.103486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:07 | 200 |  800.237358ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:12 | 200 |  507.578689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:12 | 200 |  458.547326ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:18 | 200 |  180.118115ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:18 | 200 |  183.429445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:29 | 200 |  2.176445677s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:31 | 200 |  2.136366913s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:40 | 200 |   2.09451467s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:42 | 200 |  2.054908681s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |   461.03547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |  445.967572ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |  123.176326ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  203.441727ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  261.427921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  256.136666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  198.409698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  192.951944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  273.013556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  255.900819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  262.591282ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  257.439468ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  300.355752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  290.196551ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  203.460911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  202.855722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  173.540775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  174.549117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  263.094569ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  259.544499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  169.070722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  169.492949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  264.753356ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:05 | 200 |  258.151333ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:38 | 200 |  4.578377588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:42 | 200 |  4.315993147s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:44 | 200 |  1.834749955s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:49 | 200 |  4.446968594s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:53 | 200 |  4.252742931s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:55 | 200 |  1.799616519s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |  433.214643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |   458.75978ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |  176.232613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  213.143787ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  109.318915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  150.767635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:58 | 200 |  432.736027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:58 | 200 |  321.922819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:59 | 200 |  820.851375ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:00 | 200 |  840.302343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:00 | 200 |  382.753486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  396.209504ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  139.073883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  137.244626ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:02 | 200 |  817.011491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  753.764256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  130.102094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  210.247266ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:04 | 200 |  710.108091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  622.104944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  257.013709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  247.948671ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  220.030658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:06 | 200 |  217.357866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:20 | 200 |  301.292935ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:21 | 200 |  291.086893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:46 | 200 |  911.630883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:47 | 200 |  774.311509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:17 | 200 |  291.385876ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:18 | 200 |  289.243518ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:25 | 200 |  224.914551ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  231.370866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  230.858649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  223.030612ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:27 | 200 |   896.39329ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:28 | 200 |   798.67082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:30 | 200 |  1.581948538s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:31 | 200 |   1.60936706s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:37 | 200 |  5.032478053s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:38 | 200 |  1.463793265s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:43 | 200 |  5.225126273s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:44 | 200 |  1.716382735s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:44 | 200 |  252.559362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:45 | 200 |  1.566418451s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  356.504455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  363.738348ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  107.084681ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  147.478833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:58 | 200 |  222.089203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:58 | 200 |  244.987296ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |  507.131907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |  303.072736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |   96.926176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:06 | 200 |  178.149413ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:09 | 200 |  183.990177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:10 | 200 |  191.941038ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:18 | 200 |  327.150573ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:18 | 200 |  332.403537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:19 | 200 |  420.449719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:19 | 200 |   326.71694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:20 | 200 |  623.232714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:20 | 200 |  565.881072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:21 | 200 |  423.253341ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:21 | 200 |   393.98183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:22 | 200 |  563.263425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  556.576111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  387.783433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  291.210528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:30 | 200 |  212.655474ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  212.689132ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  229.603477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  222.112378ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:46 | 200 |   4.02570548s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:50 | 200 |   4.08182804s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:15 | 200 |  188.784869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:15 | 200 |  184.990732ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:16 | 200 |  560.925517ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:16 | 200 |  397.926064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:26 | 200 |  319.963639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:27 | 200 |  319.499331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:31 | 200 |  370.174129ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:31 | 200 |  338.138337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:59 | 200 |  1.990467673s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:01 | 200 |  1.750887892s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:01 | 200 |   581.70396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:02 | 200 |  524.434263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.750-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9575792640 required="6.2 GiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.750-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.751-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42997"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] build info | build=0 commit="unknown" tid="140401670529024" timestamp=1744575810
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140401670529024" timestamp=1744575810 total_threads=16
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42997" tid="140401670529024" timestamp=1744575810
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 13:23:31 launchpad ollama[1754]: time=2025-04-13T13:23:31.003-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 13:23:31 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 13:23:31 launchpad ollama[25157]: INFO [main] model loaded | tid="140401670529024" timestamp=1744575811
+Apr 13 13:23:32 launchpad ollama[1754]: time=2025-04-13T13:23:32.007-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 13:23:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:35 | 200 |  5.415993131s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:40 | 200 |  4.062804663s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:45 | 200 |  5.196035713s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:50 | 200 |  4.921320163s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:55 | 200 |  4.698829369s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:00 | 200 |  5.496283443s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:05 | 200 |  4.915490411s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:10 | 200 |  4.922804777s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:16 | 200 |   5.37305929s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:21 | 200 |  4.922397261s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:26 | 200 |  4.797622009s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:31 | 200 |  5.552054579s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:36 | 200 |  4.902656616s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:41 | 200 |  4.866707608s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:48 | 200 |  5.545103722s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:53 | 200 |  5.094984698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:58 | 200 |  4.939207909s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:04 | 200 |  5.649536123s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:09 | 200 |   4.71763893s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:14 | 200 |  5.182880558s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:20 | 200 |  5.840619663s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:26 | 200 |  5.745745863s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:30 | 200 |  4.924080051s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:36 | 200 |  5.067641379s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:40 | 200 |  4.325283141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:45 | 200 |  4.519983799s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:49 | 200 |  4.658572694s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:54 | 200 |   5.17862032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:00 | 200 |  5.570241801s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:02 | 200 |     593.805µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 13:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:02 | 200 |      23.853µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 13:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:06 | 200 |  5.471502071s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:11 | 200 |  5.872309744s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:17 | 200 |  5.286808418s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:23 | 200 |  6.135609698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:27 | 200 |  4.372406392s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:33 | 200 |  5.131204286s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:37 | 200 |  4.268732371s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:41 | 200 |  4.477129755s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:46 | 200 |  4.496509701s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:50 | 200 |  4.488880857s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:55 | 200 |  5.020606129s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:01 | 200 |  5.325673895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:06 | 200 |  4.815120267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:10 | 200 |  4.495471187s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:16 | 200 |  5.461260633s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:21 | 200 |  5.702900588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:28 | 200 |  6.186560401s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:33 | 200 |   5.63639189s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:39 | 200 |  5.631363555s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:45 | 200 |  6.003421144s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:51 | 200 |  5.524773823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:56 | 200 |  4.964827112s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:02 | 200 |  6.005929633s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:07 | 200 |   5.72471856s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:13 | 200 |  5.769708419s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:19 | 200 |  6.137424034s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:26 | 200 |  6.133989594s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:32 | 200 |  6.510034616s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:38 | 200 |  5.754626249s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:43 | 200 |  5.260813264s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:49 | 200 |  5.682071603s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:55 | 200 |  5.567776363s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:00 | 200 |  5.158658795s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:06 | 200 |  5.638567621s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:12 | 200 |  6.349813088s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:17 | 200 |  4.757526294s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:22 | 200 |  5.546029631s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:28 | 200 |  5.197849775s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:33 | 200 |  5.500846095s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:39 | 200 |  5.654043011s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:45 | 200 |  5.721036589s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:50 | 200 |  5.064005236s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:55 | 200 |  5.816013542s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:01 | 200 |  5.668873855s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:07 | 200 |  5.660936918s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:13 | 200 |  5.837027525s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:19 | 200 |  5.895537466s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:25 | 200 |  6.482066041s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:31 | 200 |  5.836212534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:37 | 200 |   5.76839687s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:43 | 200 |  6.209691243s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:50 | 200 |  6.437179005s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:56 | 200 |  6.246523974s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:02 | 200 |  5.639837003s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:06 | 200 |  4.798913797s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:12 | 200 |  5.975174644s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:18 | 200 |  5.606964754s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:24 | 200 |  5.813943653s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:30 | 200 |  6.250468553s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:35 | 200 |  5.233499699s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:42 | 200 |  5.975729359s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:48 | 200 |  6.100053273s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:54 | 200 |  6.231265195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:59 | 200 |  5.522625483s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:06 | 200 |   6.05436449s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:12 | 200 |  6.272603497s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:18 | 200 |  5.881001648s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:24 | 200 |  6.275035141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:30 | 200 |  5.452967453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:36 | 200 |  5.942843035s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:41 | 200 |  5.677462949s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:47 | 200 |  5.878414618s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:53 | 200 |  5.845273746s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:59 | 200 |  5.909972271s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:05 | 200 |  6.136041957s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:12 | 200 |  6.279992125s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:18 | 200 |  6.044054175s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:24 | 200 |  5.920208613s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:30 | 200 |  5.989413926s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:36 | 200 |  6.188999141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:41 | 200 |  5.115428148s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:46 | 200 |  5.405639943s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:52 | 200 |  5.584974571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:58 | 200 |  5.833793973s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:03 | 200 |  5.491157713s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:10 | 200 |  5.987821542s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:16 | 200 |  6.057862445s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:21 | 200 |  5.777213823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:27 | 200 |  5.795979087s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:33 | 200 |  6.190637203s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:39 | 200 |  5.786517686s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:46 | 200 |  6.210268898s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:52 | 200 |  5.931000203s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:57 | 200 |  5.500044322s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:03 | 200 |  5.621318681s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:08 | 200 |  5.697127457s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:14 | 200 |  5.808690627s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:20 | 200 |  5.555098883s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:26 | 200 |   6.01157704s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:32 | 200 |  5.501388107s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:37 | 200 |   5.62617979s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:43 | 200 |  5.987086515s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:49 | 200 |  5.732456176s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:54 | 200 |  5.432410098s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:00 | 200 |  5.077717101s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:05 | 200 |  4.982458955s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:11 | 200 |    6.1086728s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:17 | 200 |  6.086346691s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:23 | 200 |  5.730260276s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:28 | 200 |  5.582906284s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:34 | 200 |  5.733756533s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:39 | 200 |  5.165732539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:45 | 200 |  5.479445315s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:51 | 200 |  5.779007014s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:56 | 200 |  5.477441837s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:00 | 200 |  4.349689258s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:06 | 200 |  5.463534221s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:12 | 200 |  5.708383993s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:17 | 200 |  5.335859063s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:23 | 200 |  5.971078057s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:29 | 200 |  5.604126405s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:34 | 200 |  5.648820459s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:40 | 200 |  5.255048549s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:45 | 200 |  5.707188463s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:51 | 200 |  5.477529583s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:57 | 200 |  6.053374263s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:03 | 200 |  5.513459723s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:08 | 200 |  5.493355872s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:14 | 200 |  5.347172054s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:19 | 200 |  5.752443182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:25 | 200 |  5.975179839s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:31 | 200 |  6.039535814s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:37 | 200 |  5.605949784s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:43 | 200 |  6.289499502s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:49 | 200 |   5.83331787s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:55 | 200 |   5.85643737s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:01 | 200 |  5.822546245s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:07 | 200 |  6.249896892s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:13 | 200 |    5.6911034s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:19 | 200 |    6.0428219s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:25 | 200 |   5.57264006s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:30 | 200 |  4.988640788s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:34 | 200 |   4.64227828s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:39 | 200 |   4.41007889s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:44 | 200 |  4.606295726s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:48 | 200 |  4.635780426s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:53 | 200 |  4.686856472s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:58 | 200 |  4.976407078s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:03 | 200 |  5.001600696s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:09 | 200 |  5.486809128s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:13 | 200 |  4.805197748s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:19 | 200 |  5.640329132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:24 | 200 |  5.153029188s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:30 | 200 |  5.433601033s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:35 | 200 |  5.133185895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:40 | 200 |  5.502278335s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:46 | 200 |  5.491523638s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:51 | 200 |  4.931678907s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:56 | 200 |  5.266544924s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:02 | 200 |  5.933332881s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:08 | 200 |  5.495537267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:13 | 200 |  4.836678527s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:18 | 200 |  4.989040022s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:23 | 200 |  5.329049969s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:28 | 200 |  5.234773845s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:33 | 200 |  4.962046216s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:39 | 200 |  5.344885645s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:44 | 200 |  5.375789875s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:49 | 200 |  4.958244411s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:54 | 200 |  5.102096939s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:55 | 200 |  431.821525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:08 | 200 |  5.749759226s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:13 | 200 |  5.065595582s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:19 | 200 |  5.013637743s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:24 | 200 |  5.711333129s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:29 | 200 |  4.860127326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:34 | 200 |  5.122585296s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:40 | 200 |  5.861769567s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:46 | 200 |  5.829885673s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:51 | 200 |  5.035542284s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:56 | 200 |  5.155020132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:01 | 200 |   4.44098361s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:05 | 200 |  4.589102545s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:10 | 200 |  4.652115898s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:15 | 200 |  5.171521688s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:21 | 200 |  5.378501947s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:26 | 200 |  5.372232475s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:32 | 200 |    5.8945508s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:38 | 200 |  5.479668875s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:44 | 200 |  6.097543657s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:48 | 200 |  4.290027126s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:53 | 200 |  5.181262161s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:58 | 200 |  4.265586185s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:02 | 200 |  4.473588432s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:07 | 200 |  4.453949267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:11 | 200 |  4.559658765s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:16 | 200 |   5.06389266s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:22 | 200 |  5.447513438s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:27 | 200 |  4.776504442s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:31 | 200 |  4.566599545s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:37 | 200 |  5.546128959s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:43 | 200 |  5.752772539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:49 | 200 |  6.234532801s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:55 | 200 |  5.605820534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:00 | 200 |  5.677275317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:06 | 200 |   6.05924743s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:12 | 200 |  5.577431004s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:17 | 200 |  4.968856659s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:23 | 200 |  6.106475063s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:29 | 200 |  5.846626533s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:35 | 200 |  5.776827368s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:41 | 200 |  6.188789661s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:47 | 200 |  6.182290689s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:54 | 200 |   6.60229112s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:00 | 200 |  5.877782518s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:05 | 200 |  5.338586691s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:11 | 200 |  5.789687501s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:17 | 200 |  5.606892329s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:22 | 200 |  5.192657539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:28 | 200 |   5.75842992s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:34 | 200 |  6.487501692s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:39 | 200 |  5.131103037s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:45 | 200 |  5.845605572s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:51 | 200 |  5.599578609s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:57 | 200 |   5.97033032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:03 | 200 |  6.003435148s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:09 | 200 |  5.957370711s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:14 | 200 |  5.334350662s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:20 | 200 |  5.814787472s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:26 | 200 |  5.762704932s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:32 | 200 |  5.717739312s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:38 | 200 |  5.819520894s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:44 | 200 |  5.812038753s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:50 | 200 |  6.384932715s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:56 | 200 |     5.746921s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:02 | 200 |  5.892990121s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:08 | 200 |  6.084744143s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:14 | 200 |  6.498642586s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:21 | 200 |   6.40168326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:26 | 200 |  5.655228905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:31 | 200 |  4.894272095s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:37 | 200 |  5.943296721s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:43 | 200 |  5.655136067s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:49 | 200 |  5.866005009s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:55 | 200 |  6.253577932s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:01 | 200 |  5.270707366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:07 | 200 |     6.037823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:13 | 200 |  6.254477182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:19 | 200 |  6.380542311s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:25 | 200 |  5.623668814s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:31 | 200 |   6.12393075s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:37 | 200 |  6.187287676s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:43 | 200 |  5.836886167s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:50 | 200 |  6.250676853s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:55 | 200 |  5.485741682s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:01 | 200 |  6.026935536s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:07 | 200 |  5.749063147s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:13 | 200 |  5.955839625s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:19 | 200 |   5.91419045s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:25 | 200 |  5.918028006s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:31 | 200 |  6.101490779s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:37 | 200 |  6.276725098s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:44 | 200 |  6.131594161s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:49 | 200 |  5.796885366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:56 | 200 |  6.096006088s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:02 | 200 |   6.09558453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:07 | 200 |  5.447946805s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:13 | 200 |  5.713022478s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:19 | 200 |  5.843924371s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:25 | 200 |  6.095347033s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:31 | 200 |  5.740966368s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:37 | 200 |  6.347683448s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:44 | 200 |  6.435408876s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:50 | 200 |  5.964024895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:56 | 200 |  6.224970635s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:03 | 200 |  6.686047249s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:09 | 200 |  6.294346113s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:15 | 200 |  6.307390503s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:21 | 200 |  6.118937514s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:27 | 200 |   5.83837977s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:33 | 200 |   5.83770728s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:39 | 200 |  5.526917064s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:45 | 200 |   5.84416987s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:51 | 200 |  5.873270608s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:57 | 200 |  6.339226388s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:03 | 200 |  5.746402708s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:09 | 200 |  5.983170946s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:15 | 200 |  6.066471507s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:21 | 200 |  5.797107428s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:26 | 200 |  5.668402952s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:32 | 200 |  5.310502643s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:37 | 200 |  5.023337994s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:43 | 200 |   6.11730093s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:49 | 200 |  5.963660689s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:55 | 200 |  5.969444444s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:01 | 200 |  6.001424453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:07 | 200 |  5.927428618s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:13 | 200 |  5.540336534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:18 | 200 |  5.597380048s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:24 | 200 |  6.127936571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:30 | 200 |  5.806394948s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:35 | 200 |  4.594824523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:41 | 200 |  5.833699288s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:46 | 200 |  5.638136462s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:52 | 200 |  5.699265974s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:58 | 200 |   6.06026667s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:04 | 200 |  5.917653905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:10 | 200 |  5.696973403s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:16 | 200 |  5.567990639s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:21 | 200 |  5.856747188s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:27 | 200 |  5.698476772s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:33 | 200 |  5.964897303s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:39 | 200 |  5.460831696s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:44 | 200 |  5.594724702s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:50 | 200 |  5.402888637s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:55 | 200 |  5.688187971s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:01 | 200 |    5.9337402s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:07 | 200 |  6.021417447s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:13 | 200 |   5.71824651s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:20 | 200 |  6.334822935s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:26 | 200 |  5.925755234s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:32 | 200 |  5.976887504s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:37 | 200 |  5.693066817s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:44 | 200 |  6.240373588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:49 | 200 |  5.583000507s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:55 | 200 |  6.102539493s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:01 | 200 |  5.672866759s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:06 | 200 |  5.033931269s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:11 | 200 |  4.531799214s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:15 | 200 |   4.41780876s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:20 | 200 |  4.686500532s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:25 | 200 |  4.603118758s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:29 | 200 |  4.647055381s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:34 | 200 |  4.963434784s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:39 | 200 |   4.96611803s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:45 | 200 |  5.447071036s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:50 | 200 |  4.752277675s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:55 | 200 |  5.605813212s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:00 | 200 |  5.100829576s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:06 | 200 |  5.435428538s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:11 | 200 |   5.16640024s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:17 | 200 |  5.601487879s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:22 | 200 |  5.625300132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:27 | 200 |     4.949263s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:33 | 200 |   5.26491666s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:38 | 200 |   5.84911171s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:44 | 200 |  5.393961401s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:49 | 200 |   4.77867586s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:54 | 200 |  4.928236182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:59 | 200 |  5.329308572s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:04 | 200 |  5.344585045s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:09 | 200 |  4.906445709s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:15 | 200 |  5.301343198s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:20 | 200 |  5.464103927s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:25 | 200 |  5.037028726s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:30 | 200 |  5.063213861s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:31 | 200 |  436.823719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:39 | 200 |  444.414215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:39 | 200 |  333.741721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:46 | 200 |  6.584590072s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:47 | 200 |  1.076481124s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:54 | 200 |  6.531209513s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |   997.65372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |  101.669931ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |  184.889674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:56 | 200 |  351.240253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:56 | 200 |  332.652948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:59 | 200 |  2.959259391s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:02 | 200 |  2.904589998s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:05 | 200 |  2.978848365s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:08 | 200 |  3.092754275s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:14 | 200 |  5.053710279s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:19 | 200 |  5.046706195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:24 | 200 |  5.370630804s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:26 | 200 |  1.264594055s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:31 | 200 |   5.18030291s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:36 | 200 |  5.071941887s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:41 | 200 |  5.249375247s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  1.227306032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  130.518129ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  129.362381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:04:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:04:55 | 200 |     556.304µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 14:05:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:05:37 | 200 |      27.382µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9613017088 required="6.2 GiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45703"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] build info | build=0 commit="unknown" tid="140346057936896" timestamp=1744578393
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140346057936896" timestamp=1744578393 total_threads=16
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45703" tid="140346057936896" timestamp=1744578393
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.899-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:06:33 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:06:34 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:06:34 launchpad ollama[37332]: INFO [main] model loaded | tid="140346057936896" timestamp=1744578394
+Apr 13 14:06:34 launchpad ollama[1754]: time=2025-04-13T14:06:34.902-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:06:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:35 | 200 |  1.603741754s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:40 | 200 |  168.741295ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:41 | 200 |  922.237471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:41 launchpad ollama[1754]: time=2025-04-13T14:06:41.327-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:41 launchpad ollama[1754]: time=2025-04-13T14:06:41.468-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.150-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.151-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34297"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] build info | build=0 commit="unknown" tid="140574654976000" timestamp=1744578402
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140574654976000" timestamp=1744578402 total_threads=16
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34297" tid="140574654976000" timestamp=1744578402
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:06:42 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.403-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:06:50 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:06:50 launchpad ollama[37378]: INFO [main] model loaded | tid="140574654976000" timestamp=1744578410
+Apr 13 14:06:50 launchpad ollama[1754]: time=2025-04-13T14:06:50.676-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Apr 13 14:06:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:56 | 200 | 15.633349406s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:06:57 launchpad ollama[1754]: time=2025-04-13T14:06:57.018-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:57 | 200 |  879.132204ms |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:06:57 launchpad ollama[1754]: time=2025-04-13T14:06:57.931-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:59 | 200 |  1.297501798s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:07:27 launchpad ollama[1754]: time=2025-04-13T14:07:27.939-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="696.9 MiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.662-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9739173888 required="6.2 GiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.663-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.663-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36439"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] build info | build=0 commit="unknown" tid="139924029399040" timestamp=1744578448
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139924029399040" timestamp=1744578448 total_threads=16
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36439" tid="139924029399040" timestamp=1744578448
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.915-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:07:28 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:07:29 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:07:29 launchpad ollama[39832]: INFO [main] model loaded | tid="139924029399040" timestamp=1744578449
+Apr 13 14:07:29 launchpad ollama[1754]: time=2025-04-13T14:07:29.918-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:07:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:30 | 200 |  2.330926959s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:30 | 200 |  166.981297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:31 | 200 |  1.139997584s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:31 launchpad ollama[1754]: time=2025-04-13T14:07:31.441-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:07:31 launchpad ollama[1754]: time=2025-04-13T14:07:31.586-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.267-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.267-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.268-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 40181"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] build info | build=0 commit="unknown" tid="139770075373568" timestamp=1744578452
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139770075373568" timestamp=1744578452 total_threads=16
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40181" tid="139770075373568" timestamp=1744578452
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:07:32 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.564-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:07:33 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:07:33 launchpad ollama[39868]: INFO [main] model loaded | tid="139770075373568" timestamp=1744578453
+Apr 13 14:07:33 launchpad ollama[1754]: time=2025-04-13T14:07:33.568-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:07:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:37 | 200 |   6.08534954s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.167-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="628.2 MiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.877-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9679863808 required="6.2 GiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.877-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.878-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41487"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] build info | build=0 commit="unknown" tid="140242398044160" timestamp=1744578492
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242398044160" timestamp=1744578492 total_threads=16
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41487" tid="140242398044160" timestamp=1744578492
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:08:13 launchpad ollama[1754]: time=2025-04-13T14:08:13.131-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:08:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:08:13 launchpad ollama[40988]: INFO [main] model loaded | tid="140242398044160" timestamp=1744578493
+Apr 13 14:08:14 launchpad ollama[1754]: time=2025-04-13T14:08:14.135-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:08:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:16 | 200 |  4.806453389s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:31 | 200 |  175.757221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:32 | 200 |  172.517442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |  1.288758055s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |   88.791494ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |  170.831309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:34 | 200 |   952.11716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:34 launchpad ollama[1754]: time=2025-04-13T14:08:34.760-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:08:34 launchpad ollama[1754]: time=2025-04-13T14:08:34.911-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.602-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.602-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35291"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] build info | build=0 commit="unknown" tid="139928022212608" timestamp=1744578515
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139928022212608" timestamp=1744578515 total_threads=16
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35291" tid="139928022212608" timestamp=1744578515
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:08:35 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.904-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:08:36 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:08:36 launchpad ollama[41137]: INFO [main] model loaded | tid="139928022212608" timestamp=1744578516
+Apr 13 14:08:36 launchpad ollama[1754]: time=2025-04-13T14:08:36.908-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:08:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:45 | 200 | 10.705663426s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:10:07 launchpad ollama[1754]: time=2025-04-13T14:10:07.659-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="538.5 MiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9636937728 required="6.2 GiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.385-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38155"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] build info | build=0 commit="unknown" tid="140660544999424" timestamp=1744578608
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140660544999424" timestamp=1744578608 total_threads=16
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38155" tid="140660544999424" timestamp=1744578608
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.637-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:10:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:10:09 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:10:09 launchpad ollama[43401]: INFO [main] model loaded | tid="140660544999424" timestamp=1744578609
+Apr 13 14:10:09 launchpad ollama[1754]: time=2025-04-13T14:10:09.641-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:10:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:10 | 200 |  2.942440113s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:11 | 200 |  4.015087509s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:13 | 200 |  5.971331023s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:27 | 200 |  785.267226ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  533.626034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |   525.04085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  579.684908ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  933.626698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |  1.627546595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |   1.05933701s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |  1.444562882s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:49 | 200 |  1.080242134s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:49 | 200 |  2.005128209s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |    1.5564227s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |  1.631973032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |  1.428060482s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:53 | 200 |  6.875375788s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:58 | 200 |  4.312372391s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:00 | 200 |  1.872216451s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:30 | 200 |  222.520879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:30 | 200 |  307.261977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:02 | 200 |  178.863566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:02 | 200 |  168.590763ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:03 | 200 |  1.229131089s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:04 | 200 |   84.885653ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:04 | 200 |  165.331666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |  1.205902473s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |   84.907812ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |  165.622158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |   810.74577ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |   85.652738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |  165.694432ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:07 | 200 |  1.189130444s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |   84.733814ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |  165.722535ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |  739.458729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |    86.31391ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |  167.013607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |  614.948209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |   83.654423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  168.015712ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  464.896495ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  167.517838ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  164.307641ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  292.698229ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  167.910215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  167.252018ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  201.751736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  166.702799ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  165.305206ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  457.359722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |   86.846549ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  170.970373ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |  611.425013ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |   85.038862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |  170.081852ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  448.816767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |     166.767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  165.092509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  179.544664ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  166.649812ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:15 | 200 |  165.778387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:15 | 200 |  877.670041ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |   84.912889ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  165.850521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  207.647324ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  166.458928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  166.767253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |  686.543694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |  166.765162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |   167.16146ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  293.760466ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  166.895627ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  166.969389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  257.445145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |   166.68461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  165.672921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  209.119235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  167.356696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  166.746511ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  227.172747ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:20 | 200 |  124.692001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:20 | 200 |  166.931886ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:21 | 200 |  653.656061ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.061-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.205-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.871-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.871-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 33735"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] build info | build=0 commit="unknown" tid="140155783323648" timestamp=1744578741
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140155783323648" timestamp=1744578741 total_threads=16
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33735" tid="140155783323648" timestamp=1744578741
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:12:21 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: time=2025-04-13T14:12:22.170-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:12:22 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:12:22 launchpad ollama[44133]: INFO [main] model loaded | tid="140155783323648" timestamp=1744578742
+Apr 13 14:12:23 launchpad ollama[1754]: time=2025-04-13T14:12:23.174-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:12:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:35 | 200 | 14.697469857s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:13:10 launchpad ollama[1754]: time=2025-04-13T14:13:10.721-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="626.2 MiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.416-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9660923904 required="6.2 GiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.416-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.417-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43871"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] build info | build=0 commit="unknown" tid="139872123678720" timestamp=1744578791
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139872123678720" timestamp=1744578791 total_threads=16
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43871" tid="139872123678720" timestamp=1744578791
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.669-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:13:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:13:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:13:12 launchpad ollama[48447]: INFO [main] model loaded | tid="139872123678720" timestamp=1744578792
+Apr 13 14:13:12 launchpad ollama[1754]: time=2025-04-13T14:13:12.672-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:13:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:12 | 200 |  2.302713961s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:12 | 200 |  158.626226ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:13 | 200 |  937.588661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:14 | 200 |   77.584479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:14 | 200 |  158.260904ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |  934.774772ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |   77.100094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |  157.697444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |  779.977236ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |   76.910703ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |  158.041031ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |  1.088751922s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |   76.992547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |  158.428366ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |  662.584613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |    77.92364ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |  159.108835ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:19 | 200 |  1.096088798s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:19 | 200 |  116.956081ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  159.109475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  471.290726ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  117.607633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  157.309767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  294.918527ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  156.769376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  158.300633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  202.921287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  158.063971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  158.939678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  499.578141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |   76.557242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  159.578275ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |   966.20284ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |   76.472993ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |  156.247542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   496.94847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   119.74891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |  157.811469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   177.39125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:25 | 200 |  157.881945ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:25 | 200 |  161.005477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  890.745596ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |   76.973992ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |   158.59959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  208.329914ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  158.113737ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  159.837674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  694.464519ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  118.615055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  157.215416ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |  293.984475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |   157.15125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |   157.90609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |  263.528027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  158.802444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  157.186862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  208.829409ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |   158.30828ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  156.715398ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  230.375061ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:30 | 200 |  119.461532ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:30 | 200 |  158.515154ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:31 | 200 |  1.170909187s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:31 launchpad ollama[1754]: time=2025-04-13T14:13:31.644-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:13:31 launchpad ollama[1754]: time=2025-04-13T14:13:31.791-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.486-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.487-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37457"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] build info | build=0 commit="unknown" tid="139833438691328" timestamp=1744578812
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139833438691328" timestamp=1744578812 total_threads=16
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37457" tid="139833438691328" timestamp=1744578812
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:13:32 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.788-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:13:33 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:13:33 launchpad ollama[48554]: INFO [main] model loaded | tid="139833438691328" timestamp=1744578813
+Apr 13 14:13:33 launchpad ollama[1754]: time=2025-04-13T14:13:33.791-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:13:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:37 | 200 |  6.297320741s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:14:02 launchpad ollama[1754]: time=2025-04-13T14:14:02.705-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="674.2 MiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.405-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9715449856 required="6.2 GiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.405-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.406-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44305"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] build info | build=0 commit="unknown" tid="140377334923264" timestamp=1744578843
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140377334923264" timestamp=1744578843 total_threads=16
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44305" tid="140377334923264" timestamp=1744578843
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.658-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:14:03 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:14:04 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:14:04 launchpad ollama[49647]: INFO [main] model loaded | tid="140377334923264" timestamp=1744578844
+Apr 13 14:14:04 launchpad ollama[1754]: time=2025-04-13T14:14:04.661-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:14:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:04 | 200 |  2.306030695s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:05 | 200 |  166.656088ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |  1.138165787s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |   87.422005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |  167.784936ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  1.183649612s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  129.367308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  167.598403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |  773.757227ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |   86.455469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |  170.154309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |    1.1932753s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |   85.359108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |  169.954418ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |  700.841168ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |   86.867979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |  167.969871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |  824.591738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |   84.802423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |  166.892698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  466.128794ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  126.287256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |   168.85059ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  294.322349ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  168.301309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  167.423887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  200.010107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  164.964417ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  168.155695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  500.390545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |   85.720084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  167.746751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  656.502986ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |    86.17137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  167.822759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  452.497976ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  167.551093ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  166.131167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  178.737683ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |   169.00327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  165.137246ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |  884.661976ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |    86.11287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |    169.0752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |  207.674445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  167.969393ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  167.365528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  690.611287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  126.045026ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  168.416199ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  295.265574ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  166.335214ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  166.803242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  259.505072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  167.906408ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  166.511764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  206.139104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  167.362357ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  166.370259ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  227.774431ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  125.990828ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  168.339496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:24 | 200 |  1.076270848s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.068-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.212-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.877-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.877-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.878-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 36923"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] build info | build=0 commit="unknown" tid="139944885350400" timestamp=1744578864
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944885350400" timestamp=1744578864 total_threads=16
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36923" tid="139944885350400" timestamp=1744578864
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:14:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: time=2025-04-13T14:14:25.175-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:14:25 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:14:26 launchpad ollama[49752]: INFO [main] model loaded | tid="139944885350400" timestamp=1744578866
+Apr 13 14:14:26 launchpad ollama[1754]: time=2025-04-13T14:14:26.178-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:14:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:43 | 200 | 18.938773782s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:15:23 launchpad ollama[1754]: time=2025-04-13T14:15:23.831-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="685.9 MiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9727639552 required="6.2 GiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.538-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37539"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.538-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.539-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.539-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] build info | build=0 commit="unknown" tid="140495880671232" timestamp=1744578924
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140495880671232" timestamp=1744578924 total_threads=16
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37539" tid="140495880671232" timestamp=1744578924
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.789-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:15:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:15:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:15:25 launchpad ollama[55078]: INFO [main] model loaded | tid="140495880671232" timestamp=1744578925
+Apr 13 14:15:25 launchpad ollama[1754]: time=2025-04-13T14:15:25.793-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:15:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:25 | 200 |  2.314727864s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:26 | 200 |  171.157155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |  1.210687591s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |    89.89124ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |  128.727309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:28 | 200 |  1.286389493s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:28 | 200 |   92.404352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:29 | 200 |  172.220329ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:29 | 200 |  803.624256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:30 | 200 |   88.948481ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:30 | 200 |  168.204584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |  1.223560146s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |   90.375008ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |   172.41442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |   751.93462ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |   89.094999ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |  170.968699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |  841.495071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |   91.642477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |  169.916497ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  470.730912ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  129.556752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  172.025034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  296.047595ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  169.916791ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |   170.46005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |   202.58232ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  173.493465ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  172.141211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |  512.778462ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |   89.326113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |  169.889956ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |  660.200756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |   88.097633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |  171.044793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  473.604906ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  170.801219ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  170.458333ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  178.146144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  168.480833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |  172.921608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |  678.462695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |    88.61989ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  171.022436ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  208.262025ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  170.280019ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |   170.34308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |   699.01613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  129.198849ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  170.989689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  296.178441ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  170.188273ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  171.127482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  263.126762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  168.281645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  170.153962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  210.320573ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  169.909425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  172.835727ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |   228.19585ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  129.579855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:44 | 200 |  171.002463ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:45 | 200 |  1.143145453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:45 launchpad ollama[1754]: time=2025-04-13T14:15:45.336-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:15:45 launchpad ollama[1754]: time=2025-04-13T14:15:45.486-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.188-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46877"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] build info | build=0 commit="unknown" tid="140188558594048" timestamp=1744578946
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140188558594048" timestamp=1744578946 total_threads=16
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46877" tid="140188558594048" timestamp=1744578946
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:15:46 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.492-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:15:47 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:15:47 launchpad ollama[55222]: INFO [main] model loaded | tid="140188558594048" timestamp=1744578947
+Apr 13 14:15:47 launchpad ollama[1754]: time=2025-04-13T14:15:47.496-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:16:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:16:03 | 200 | 18.466346928s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:17:17 launchpad ollama[1754]: time=2025-04-13T14:17:17.360-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="658.0 MiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9719382016 required="6.2 GiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.062-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41757"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] build info | build=0 commit="unknown" tid="140619095724032" timestamp=1744579038
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140619095724032" timestamp=1744579038 total_threads=16
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41757" tid="140619095724032" timestamp=1744579038
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.314-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:17:18 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:17:19 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:17:19 launchpad ollama[60114]: INFO [main] model loaded | tid="140619095724032" timestamp=1744579039
+Apr 13 14:17:19 launchpad ollama[1754]: time=2025-04-13T14:17:19.318-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:17:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:19 | 200 |  2.319222132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:19 | 200 |   176.44222ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:20 | 200 |  1.063898495s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:20 | 200 |   97.456771ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:21 | 200 |  177.324342ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |  1.066073961s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |   95.389306ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |  177.712901ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |  778.405138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |   96.984762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |  178.013899ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |   984.17276ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |    96.39856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |  175.885764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |  707.201969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |   97.504063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |  177.951254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:26 | 200 |  1.126825267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   98.129709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |  177.913595ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   515.08808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   96.557575ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |  138.549689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  254.054972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  177.425297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  138.834882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  161.296773ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  137.164197ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  179.224766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  472.491769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  136.947242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  178.195423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |  618.304068ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |   98.411077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |  177.763165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  455.585747ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  178.780445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  180.004825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  178.484068ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  179.638942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |  179.764081ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |  671.106515ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |   96.062121ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  179.441546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  208.985995ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  178.031172ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  179.445471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  698.184353ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  138.240377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  178.092645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  294.519435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  179.066968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  177.470116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  262.179455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  178.519184ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  178.901163ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  210.928807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  177.787493ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  176.451428ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  228.905948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:37 | 200 |  137.155408ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:37 | 200 |  177.565041ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:38 | 200 |  1.417654354s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:38 launchpad ollama[1754]: time=2025-04-13T14:17:38.835-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:17:38 launchpad ollama[1754]: time=2025-04-13T14:17:38.977-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.663-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.663-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34649"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] build info | build=0 commit="unknown" tid="139952822145024" timestamp=1744579059
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139952822145024" timestamp=1744579059 total_threads=16
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34649" tid="139952822145024" timestamp=1744579059
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:17:39 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.961-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:17:40 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:17:40 launchpad ollama[60222]: INFO [main] model loaded | tid="139952822145024" timestamp=1744579060
+Apr 13 14:17:40 launchpad ollama[1754]: time=2025-04-13T14:17:40.964-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:17:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:55 | 200 | 16.667132647s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:18:48 launchpad ollama[1754]: time=2025-04-13T14:18:48.847-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="663.6 MiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9746251776 required="6.2 GiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43257"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] build info | build=0 commit="unknown" tid="140056496005120" timestamp=1744579129
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140056496005120" timestamp=1744579129 total_threads=16
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43257" tid="140056496005120" timestamp=1744579129
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.806-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:18:49 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:18:50 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:18:50 launchpad ollama[64159]: INFO [main] model loaded | tid="140056496005120" timestamp=1744579130
+Apr 13 14:18:50 launchpad ollama[1754]: time=2025-04-13T14:18:50.810-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:18:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:50 | 200 |  2.313253254s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:51 | 200 |  166.536958ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |  1.096044537s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |   167.80689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |  167.278703ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |  1.097934513s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |   85.764496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |   171.23403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:54 | 200 |  812.408253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:54 | 200 |   86.286988ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:55 | 200 |  167.250607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |  1.079654534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |   86.127655ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |  167.522467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |  745.915362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |   86.271624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |  167.038229ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |   880.92768ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |   87.246529ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |  127.160666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  467.429797ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  127.379213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  166.864857ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |   293.54243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  167.623815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.778638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  200.019556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.020925ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.775122ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |  459.824909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |   86.685794ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |  168.194174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |  963.723567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |   87.690111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |  166.403028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |   454.23558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  167.392722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |   168.62208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  178.556716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  167.639075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  168.254254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:04 | 200 |  885.088514ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:04 | 200 |    87.53913ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  167.769488ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  207.318696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  172.321189ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  168.721826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  692.854417ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  126.812033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |   168.76819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  294.056719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.403955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.626091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  259.491471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  168.224123ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.666895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  207.745311ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  166.485726ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  169.049776ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |   226.26969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  128.590684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:09 | 200 |  166.536268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:10 | 200 |  1.256961847s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:10 launchpad ollama[1754]: time=2025-04-13T14:19:10.439-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:19:10 launchpad ollama[1754]: time=2025-04-13T14:19:10.580-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.248-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.248-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 43895"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] build info | build=0 commit="unknown" tid="139804451819520" timestamp=1744579151
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139804451819520" timestamp=1744579151 total_threads=16
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43895" tid="139804451819520" timestamp=1744579151
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:19:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.557-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:19:12 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:19:12 launchpad ollama[64264]: INFO [main] model loaded | tid="139804451819520" timestamp=1744579152
+Apr 13 14:19:12 launchpad ollama[1754]: time=2025-04-13T14:19:12.561-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:19:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:29 | 200 | 19.527197996s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:22:07 launchpad ollama[1754]: time=2025-04-13T14:22:07.293-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="630.1 MiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9711058944 required="6.2 GiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39385"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] build info | build=0 commit="unknown" tid="139926906634240" timestamp=1744579328
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139926906634240" timestamp=1744579328 total_threads=16
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39385" tid="139926906634240" timestamp=1744579328
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.269-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:22:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:22:09 launchpad ollama[69406]: INFO [main] model loaded | tid="139926906634240" timestamp=1744579329
+Apr 13 14:22:09 launchpad ollama[1754]: time=2025-04-13T14:22:09.273-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:22:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:09 | 200 |  2.353418297s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:09 | 200 |  188.397914ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:10 | 200 |  1.191260053s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:10 | 200 |   69.236879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:11 | 200 |   189.68103ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |  1.273561948s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |   110.35195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |  189.628454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  774.245212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  108.251793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  192.123101ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:14 | 200 |  1.016973079s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:14 | 200 |  109.635079ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:15 | 200 |   190.88805ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:15 | 200 |  701.048332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:16 | 200 |  109.971343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:16 | 200 |  191.383176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |  1.094546905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |   67.951011ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |  189.827893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  510.153815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  148.625137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  190.730312ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  293.901948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |   193.32415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |   191.02034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  201.141419ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  190.349755ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  193.392909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |  460.750557ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |   108.52019ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |   190.12915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  616.629895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  108.459063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  191.335463ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  454.891338ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  190.275966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  190.391963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  179.505602ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  191.355618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  189.507433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:23 | 200 |  886.072943ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:23 | 200 |   109.44457ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.639759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  207.415035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.483674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.162139ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  733.686797ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  148.325524ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  190.494525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  294.843235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  190.511629ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  189.528435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  260.039953ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  191.069802ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.340179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  210.637842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.887445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.896501ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  227.558483ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:28 | 200 |  150.969037ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:28 | 200 |  191.919331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:29 | 200 |  1.447125893s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.028-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.175-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.860-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.860-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 38375"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] build info | build=0 commit="unknown" tid="140663304839168" timestamp=1744579350
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140663304839168" timestamp=1744579350 total_threads=16
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38375" tid="140663304839168" timestamp=1744579350
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:22:30 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: time=2025-04-13T14:22:31.156-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:22:31 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:22:31 launchpad ollama[69518]: INFO [main] model loaded | tid="140663304839168" timestamp=1744579351
+Apr 13 14:22:32 launchpad ollama[1754]: time=2025-04-13T14:22:32.160-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:22:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:42 | 200 | 12.304260015s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:25:45 launchpad ollama[1754]: time=2025-04-13T14:25:45.550-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="627.3 MiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9722855424 required="6.2 GiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40747"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.270-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] build info | build=0 commit="unknown" tid="140099103772672" timestamp=1744579546
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140099103772672" timestamp=1744579546 total_threads=16
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40747" tid="140099103772672" timestamp=1744579546
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.521-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:25:46 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:25:47 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:25:47 launchpad ollama[72393]: INFO [main] model loaded | tid="140099103772672" timestamp=1744579547
+Apr 13 14:25:47 launchpad ollama[1754]: time=2025-04-13T14:25:47.524-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:25:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:47 | 200 |  2.341834992s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:47 | 200 |  190.937335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |  1.230380296s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |    109.7556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |  189.152872ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:50 | 200 |  1.271804326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:50 | 200 |  110.115751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  192.545603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  776.426211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  108.764611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:52 | 200 |  190.287817ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  1.199677852s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  109.310912ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  190.554585ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  704.312548ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  108.305441ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  191.776949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |  665.574641ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |   109.39452ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |  190.428114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  470.393571ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  151.581252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  190.686979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  293.769644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  191.687966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  189.609013ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  200.416672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  189.762234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  190.790831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |  501.043779ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |   109.75825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |  191.522784ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  661.077699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  109.085825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  191.706205ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  482.784477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  152.737928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  194.511255ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  180.820352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |   193.58614ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:01 | 200 |  192.450955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:01 | 200 |  891.624352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  110.938891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  191.540928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  209.958807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  195.275862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  193.021109ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:03 | 200 |  711.733195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:03 | 200 |  194.004708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  192.805524ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  295.026947ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  191.531509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  191.688582ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  261.151826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  192.674942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  192.044451ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  211.177455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  193.054096ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  191.187125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  229.448972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  151.241535ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  192.661309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:08 | 200 |    1.3791092s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:08 launchpad ollama[1754]: time=2025-04-13T14:26:08.196-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:26:08 launchpad ollama[1754]: time=2025-04-13T14:26:08.340-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.063-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.063-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 33513"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] build info | build=0 commit="unknown" tid="140251896848384" timestamp=1744579569
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140251896848384" timestamp=1744579569 total_threads=16
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33513" tid="140251896848384" timestamp=1744579569
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:26:09 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.365-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:26:10 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:26:10 launchpad ollama[72590]: INFO [main] model loaded | tid="140251896848384" timestamp=1744579570
+Apr 13 14:26:10 launchpad ollama[1754]: time=2025-04-13T14:26:10.369-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:26:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:27 | 200 | 19.373967531s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:27:35 launchpad ollama[1754]: time=2025-04-13T14:27:35.435-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="602.6 MiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9682288640 required="6.2 GiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46831"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] build info | build=0 commit="unknown" tid="140343862452224" timestamp=1744579656
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140343862452224" timestamp=1744579656 total_threads=16
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46831" tid="140343862452224" timestamp=1744579656
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.382-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:27:36 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:27:37 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:27:37 launchpad ollama[77172]: INFO [main] model loaded | tid="140343862452224" timestamp=1744579657
+Apr 13 14:27:37 launchpad ollama[1754]: time=2025-04-13T14:27:37.386-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:27:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:37 | 200 |  2.296962868s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:37 | 200 |  166.379237ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:38 | 200 |   1.21585603s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:39 | 200 |   46.271231ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:39 | 200 |  169.328422ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |  1.262484925s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |   87.147973ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |   168.67223ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |  808.248556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |   86.403174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |  169.081056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  1.189619467s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |   86.619592ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  169.433753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  696.491187ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:44 | 200 |   86.183906ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:44 | 200 |  167.617281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |   855.40085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |   87.150238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |  166.972119ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |  505.714136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  129.234185ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  129.921396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  293.501567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  128.919895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |   128.06142ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  200.198491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  169.974946ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  128.035911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  427.187042ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  127.275077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  168.862751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  612.763069ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |   85.513107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  167.522892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |   448.75843ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  168.272483ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  169.193847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  180.487149ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |  166.390475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |    169.2984ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |  704.896492ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  128.902658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.052734ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  207.566074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.267084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.128932ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  687.098121ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  126.909354ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  172.657005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  292.500597ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  166.074656ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  167.533832ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  257.505546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  167.795423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |   167.70183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  207.628208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  169.192335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |   169.29525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  226.537473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:55 | 200 |   127.81057ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:55 | 200 |  166.385999ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:56 | 200 |  1.192449867s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:56 launchpad ollama[1754]: time=2025-04-13T14:27:56.579-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:27:56 launchpad ollama[1754]: time=2025-04-13T14:27:56.720-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.408-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.408-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.409-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37371"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] build info | build=0 commit="unknown" tid="140009756614656" timestamp=1744579677
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140009756614656" timestamp=1744579677 total_threads=16
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37371" tid="140009756614656" timestamp=1744579677
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:27:57 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.704-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:27:58 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:27:58 launchpad ollama[77315]: INFO [main] model loaded | tid="140009756614656" timestamp=1744579678
+Apr 13 14:27:58 launchpad ollama[1754]: time=2025-04-13T14:27:58.708-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:28:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:28:16 | 200 | 19.436705671s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:30:17 launchpad ollama[1754]: time=2025-04-13T14:30:17.991-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="616.6 MiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.691-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9715843072 required="6.2 GiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.691-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.692-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39737"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] build info | build=0 commit="unknown" tid="140471656603648" timestamp=1744579818
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140471656603648" timestamp=1744579818 total_threads=16
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39737" tid="140471656603648" timestamp=1744579818
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.945-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:30:18 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:30:19 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:30:19 launchpad ollama[82117]: INFO [main] model loaded | tid="140471656603648" timestamp=1744579819
+Apr 13 14:30:19 launchpad ollama[1754]: time=2025-04-13T14:30:19.948-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:20 | 200 |  2.302116458s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:20 | 200 |  173.378433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |  1.192322595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |   45.186387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |  168.571201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:22 | 200 |  1.232066125s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:23 | 200 |   85.839851ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:23 | 200 |   166.24997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   768.46796ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   87.257239ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   167.29942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |  1.189202392s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |   87.234786ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |   166.58056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |  738.995533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |   86.575609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |  168.204617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |  822.510179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |   86.251429ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |  166.460825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  506.095655ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  126.363774ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  167.313687ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  290.438735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  167.591176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  167.484531ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  201.958578ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  165.800071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  170.149609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |  458.605678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |   86.575035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |  168.349579ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  613.360404ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |   87.786444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  167.704186ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  450.809127ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  166.860723ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  168.386827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |   177.15343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  170.958186ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  172.114221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |  712.021454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |   86.313181ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |   167.87855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  208.042758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  168.749456ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  168.356369ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  704.063972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  127.699159ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  168.709584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |   294.68536ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  167.959617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  167.509728ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  263.176951ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  168.508643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  168.436998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  210.926308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  167.421001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  167.531312ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  230.263169ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  127.996815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:38 | 200 |   167.05686ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:39 | 200 |  1.466588283s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:39 launchpad ollama[1754]: time=2025-04-13T14:30:39.618-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:30:39 launchpad ollama[1754]: time=2025-04-13T14:30:39.762-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.461-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.461-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 39957"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.463-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] build info | build=0 commit="unknown" tid="139794672869376" timestamp=1744579840
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139794672869376" timestamp=1744579840 total_threads=16
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39957" tid="139794672869376" timestamp=1744579840
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:30:40 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.759-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:30:41 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:30:41 launchpad ollama[82246]: INFO [main] model loaded | tid="139794672869376" timestamp=1744579841
+Apr 13 14:30:41 launchpad ollama[1754]: time=2025-04-13T14:30:41.762-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:31:15 | 200 | 36.216183489s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:34:40 launchpad ollama[1754]: time=2025-04-13T14:34:40.036-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:34:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:34:43 | 200 |  3.813950747s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.103-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="584.3 MiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9667280896 required="6.2 GiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44545"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.805-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] build info | build=0 commit="unknown" tid="139931029549056" timestamp=1744580093
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139931029549056" timestamp=1744580093 total_threads=16
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44545" tid="139931029549056" timestamp=1744580093
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:34:54 launchpad ollama[1754]: time=2025-04-13T14:34:54.056-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:34:54 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:34:54 launchpad ollama[93210]: INFO [main] model loaded | tid="139931029549056" timestamp=1744580094
+Apr 13 14:34:55 launchpad ollama[1754]: time=2025-04-13T14:34:55.060-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:34:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:34:58 | 200 |  5.802050132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:34:58 launchpad ollama[1754]: time=2025-04-13T14:34:58.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.028-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.764-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.765-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 38355"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] build info | build=0 commit="unknown" tid="139818907459584" timestamp=1744580099
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139818907459584" timestamp=1744580099 total_threads=16
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38355" tid="139818907459584" timestamp=1744580099
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:34:59 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: time=2025-04-13T14:35:00.066-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:00 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:35:00 launchpad ollama[93250]: INFO [main] model loaded | tid="139818907459584" timestamp=1744580100
+Apr 13 14:35:01 launchpad ollama[1754]: time=2025-04-13T14:35:01.069-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:35:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:10 | 200 | 11.258725546s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:35:10 launchpad ollama[1754]: time=2025-04-13T14:35:10.314-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="608.3 MiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.026-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9667280896 required="6.2 GiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.027-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.027-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43095"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] build info | build=0 commit="unknown" tid="140445655883776" timestamp=1744580111
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140445655883776" timestamp=1744580111 total_threads=16
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43095" tid="140445655883776" timestamp=1744580111
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.279-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:35:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:35:12 launchpad ollama[95674]: INFO [main] model loaded | tid="140445655883776" timestamp=1744580112
+Apr 13 14:35:12 launchpad ollama[1754]: time=2025-04-13T14:35:12.282-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:35:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:12 | 200 |  2.314496336s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:12 | 200 |  178.116447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:13 | 200 |    1.1959879s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:13 | 200 |   96.382327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:14 | 200 |  180.869841ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |   1.23697766s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |   96.504888ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |  177.413049ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |  776.157858ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |   97.160568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |  178.191536ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:17 | 200 |  1.144426595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |   96.176572ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |  178.716816ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |  742.293388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:19 | 200 |   98.626052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:19 | 200 |  180.902001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  962.743394ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |   56.872956ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  177.506766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  467.377203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  139.222363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  179.569167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  293.263195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  180.419679ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  178.266896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  201.409315ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  180.054981ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  179.706601ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  459.576033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |   98.006545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |  178.718313ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |  615.969164ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |   97.232233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |  179.183213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |   492.92924ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |  136.994971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  177.437023ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  179.372754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  178.921879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  176.965531ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |   666.45808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |   96.754528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  179.713556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  207.324831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  177.892347ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:27 | 200 |  178.809212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:27 | 200 |   692.82106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  178.176721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  179.007007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  294.518091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  178.252741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  179.286781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  260.886672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  178.373155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |   178.31738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  208.019923ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  179.314934ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  177.517472ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  227.922241ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |   137.51217ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  176.991731ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |  1.458218137s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |   56.194729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |  138.109264ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:33 | 200 |  1.312072758s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:33 launchpad ollama[1754]: time=2025-04-13T14:35:33.848-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:35:33 launchpad ollama[1754]: time=2025-04-13T14:35:33.990-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.665-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.666-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 42123"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] build info | build=0 commit="unknown" tid="140547730632704" timestamp=1744580134
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140547730632704" timestamp=1744580134 total_threads=16
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42123" tid="140547730632704" timestamp=1744580134
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:35:34 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.975-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:35 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:35:35 launchpad ollama[95820]: INFO [main] model loaded | tid="140547730632704" timestamp=1744580135
+Apr 13 14:35:35 launchpad ollama[1754]: time=2025-04-13T14:35:35.978-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:35:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:47 | 200 |   13.4673587s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:37:21 launchpad ollama[1754]: time=2025-04-13T14:37:21.968-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="544.4 MiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9640083456 required="6.2 GiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46639"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] build info | build=0 commit="unknown" tid="140693713473536" timestamp=1744580242
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140693713473536" timestamp=1744580242 total_threads=16
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46639" tid="140693713473536" timestamp=1744580242
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.922-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:37:22 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:37:23 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:37:23 launchpad ollama[98703]: INFO [main] model loaded | tid="140693713473536" timestamp=1744580243
+Apr 13 14:37:23 launchpad ollama[1754]: time=2025-04-13T14:37:23.925-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:37:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:24 | 200 |  2.315089317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:24 | 200 |   166.59957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |  980.672758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |   85.987893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |  166.924182ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  975.715946ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  166.199779ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  168.077027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |  771.804659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |   85.363072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |  168.148106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:28 | 200 |  1.073632492s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:29 | 200 |   86.120328ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:29 | 200 |  165.991056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |  738.237636ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |   86.471263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |  168.129891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  1.029468497s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  126.173962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  166.358133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  467.155855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  127.054223ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  167.369857ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |   292.33407ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  166.069377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  165.470243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |   200.73332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  166.734724ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  167.481937ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |  499.446187ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |   127.40549ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |  166.312732ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |  998.216136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |    84.12942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |  165.787686ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  452.683624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  166.119401ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  167.372974ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  176.392607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  167.712265ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |  166.020926ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |  881.070099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |   84.837646ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  168.355874ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |   207.20885ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  170.129702ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  167.576262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |  730.203904ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |     126.269ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |  167.988504ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  294.312082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  166.463824ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |   166.23161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  259.133759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  167.989997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |    166.0267ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  207.488415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  166.309144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  166.291741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |   229.73985ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:42 | 200 |   125.80907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:42 | 200 |  170.979991ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:43 | 200 |  1.271821762s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:43 launchpad ollama[1754]: time=2025-04-13T14:37:43.613-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:37:43 launchpad ollama[1754]: time=2025-04-13T14:37:43.753-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.470-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.471-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37701"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.473-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] build info | build=0 commit="unknown" tid="139752232804352" timestamp=1744580264
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139752232804352" timestamp=1744580264 total_threads=16
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37701" tid="139752232804352" timestamp=1744580264
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:37:44 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.786-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:37:45 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:37:45 launchpad ollama[98819]: INFO [main] model loaded | tid="139752232804352" timestamp=1744580265
+Apr 13 14:37:45 launchpad ollama[1754]: time=2025-04-13T14:37:45.790-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Apr 13 14:38:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:38:01 | 200 |  18.27445792s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.128-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="534.2 MiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.830-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9610526720 required="6.2 GiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.831-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.831-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41577"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] build info | build=0 commit="unknown" tid="140652930228224" timestamp=1744580347
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140652930228224" timestamp=1744580347 total_threads=16
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41577" tid="140652930228224" timestamp=1744580347
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:39:08 launchpad ollama[1754]: time=2025-04-13T14:39:08.083-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:39:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:39:08 launchpad ollama[103173]: INFO [main] model loaded | tid="140652930228224" timestamp=1744580348
+Apr 13 14:39:09 launchpad ollama[1754]: time=2025-04-13T14:39:09.087-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:39:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:09 | 200 |  2.329335999s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:09 | 200 |  169.926374ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  1.132767102s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  130.829894ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  172.632389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |  1.130264781s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |   90.205289ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |  172.187878ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |  805.233297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |   88.955386ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |  171.941174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |  973.317608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |   90.134204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |  173.875104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |  707.457795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |   89.656754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |  173.209523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  929.248156ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  131.001807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  172.770244ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  464.639723ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  130.222601ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  171.325807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  291.187917ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  170.831525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  172.566722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  201.040661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  171.786778ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  169.581804ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |  497.590619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |   90.045003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |  171.106939ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |  651.564659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |   90.527095ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |  170.741918ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  493.252642ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  129.900256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  171.470108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  177.588201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  170.188466ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  172.863902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |  664.346179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |   90.436919ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |  171.593795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |   206.75895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |  171.253077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |  172.073231ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  687.208201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |   129.81413ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  171.588471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  294.572335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  172.780845ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  171.341557ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  258.210917ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  170.748898ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |   172.21863ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  208.696254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  170.215091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  171.483074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  226.776643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  131.068224ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:27 | 200 |  171.629924ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:27 | 200 |   755.27518ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: time=2025-04-13T14:39:27.861-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.002-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.700-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.701-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.701-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 43299"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] build info | build=0 commit="unknown" tid="140375132950528" timestamp=1744580368
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140375132950528" timestamp=1744580368 total_threads=16
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43299" tid="140375132950528" timestamp=1744580368
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:39:28 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: time=2025-04-13T14:39:29.000-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:39:29 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:39:29 launchpad ollama[103282]: INFO [main] model loaded | tid="140375132950528" timestamp=1744580369
+Apr 13 14:39:30 launchpad ollama[1754]: time=2025-04-13T14:39:30.003-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:39:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:50 | 200 | 22.534664776s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:42:51 launchpad ollama[1754]: time=2025-04-13T14:42:51.522-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="497.8 MiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9593356288 required="6.2 GiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36823"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] build info | build=0 commit="unknown" tid="139823987769344" timestamp=1744580572
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139823987769344" timestamp=1744580572 total_threads=16
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36823" tid="139823987769344" timestamp=1744580572
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.480-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:42:52 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:42:53 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:42:53 launchpad ollama[109510]: INFO [main] model loaded | tid="139823987769344" timestamp=1744580573
+Apr 13 14:42:53 launchpad ollama[1754]: time=2025-04-13T14:42:53.483-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:42:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:53 | 200 |  2.306454143s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:53 | 200 |  165.621165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  1.227310829s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  125.410836ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  165.467317ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |  1.224884307s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |    84.37337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |  169.247118ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |  815.151353ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |   86.078562ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |  166.381371ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:58 | 200 |   979.58049ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |   84.717161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |  168.557701ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |  700.521491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:00 | 200 |   87.101051ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:00 | 200 |  167.423538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  859.046988ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |   84.606035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  167.785209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  508.101419ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |    86.04045ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  126.325971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  253.398113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  129.163267ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  167.329112ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  201.597252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  170.190455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  126.588016ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  470.740276ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  125.755111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |   166.34965ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |  614.579537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |   84.393243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |  167.553907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  451.345767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  168.560357ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  166.757286ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  176.534285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |   165.06565ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |  167.382866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |  706.852649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |   84.817241ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  168.089006ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  208.238021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |   166.77963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  167.257977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |    690.1052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  125.983995ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  166.884318ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  292.994953ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  167.083415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  168.358887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  262.225932ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  166.610526ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  166.903469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  208.484896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  167.800929ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  168.639568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  228.734388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:11 | 200 |  127.559827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:11 | 200 |  164.091957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:12 | 200 |  1.103207366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:12 launchpad ollama[1754]: time=2025-04-13T14:43:12.364-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:43:12 launchpad ollama[1754]: time=2025-04-13T14:43:12.511-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.184-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.184-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.185-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34477"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] build info | build=0 commit="unknown" tid="139790480424960" timestamp=1744580593
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790480424960" timestamp=1744580593 total_threads=16
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34477" tid="139790480424960" timestamp=1744580593
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:43:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.482-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:43:14 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:43:14 launchpad ollama[109650]: INFO [main] model loaded | tid="139790480424960" timestamp=1744580594
+Apr 13 14:43:14 launchpad ollama[1754]: time=2025-04-13T14:43:14.486-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:43:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:44 | 200 | 32.071065531s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.252-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="455.6 MiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.987-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9540665344 required="6.2 GiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.987-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.988-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45257"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] build info | build=0 commit="unknown" tid="139852766101504" timestamp=1744580874
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139852766101504" timestamp=1744580874 total_threads=16
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45257" tid="139852766101504" timestamp=1744580874
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: time=2025-04-13T14:47:54.240-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:47:54 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:47:55 launchpad ollama[119217]: INFO [main] model loaded | tid="139852766101504" timestamp=1744580875
+Apr 13 14:47:55 launchpad ollama[1754]: time=2025-04-13T14:47:55.245-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:47:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:55 | 200 |  2.336176936s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:55 | 200 |  167.620546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:56 | 200 |  1.206048293s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:56 | 200 |    87.37967ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:57 | 200 |  165.898174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |  1.247195317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |    86.67248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |  165.470134ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |  817.522098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |    83.67877ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |   170.88948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:00 | 200 |  1.202322985s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:00 | 200 |   85.512998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:01 | 200 |  168.123297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:01 | 200 |  743.007025ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:02 | 200 |   84.748842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:02 | 200 |  165.219503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  833.215833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |   86.181899ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  168.187141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  510.773263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |   86.410317ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  165.704753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  292.947285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  167.087546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  167.075167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  205.593958ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  126.821211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  167.391458ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  469.677802ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  127.148157ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  168.197813ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  613.204189ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |   87.346538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  167.566321ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  455.200156ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |   168.12777ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  166.665977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  178.424922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  169.158281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  165.960777ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  708.932235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  125.851304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.364736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  209.080447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.822624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.438521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |  734.511083ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |   44.988457ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |  126.362646ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  253.265742ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  124.044282ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  125.747611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  220.209076ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  167.286234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  167.117513ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  168.128894ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  126.392959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  167.375091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  187.859807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |   85.059234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  165.927161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:14 | 200 |  1.229156956s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:14 launchpad ollama[1754]: time=2025-04-13T14:48:14.226-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:48:14 launchpad ollama[1754]: time=2025-04-13T14:48:14.392-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.061-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.062-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35195"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] build info | build=0 commit="unknown" tid="140598928834560" timestamp=1744580895
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140598928834560" timestamp=1744580895 total_threads=16
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35195" tid="140598928834560" timestamp=1744580895
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:48:15 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:48:16 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 14:48:16 launchpad ollama[119359]: INFO [main] model loaded | tid="140598928834560" timestamp=1744580896
+Apr 13 14:48:16 launchpad ollama[1754]: time=2025-04-13T14:48:16.362-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:48:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:33 | 200 | 19.272204176s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:51:42 launchpad ollama[1754]: time=2025-04-13T14:51:42.475-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="657.8 MiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9525985280 required="6.2 GiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46131"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.183-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] build info | build=0 commit="unknown" tid="139700466872320" timestamp=1744581103
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139700466872320" timestamp=1744581103 total_threads=16
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46131" tid="139700466872320" timestamp=1744581103
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.434-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:51:43 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:51:44 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:51:44 launchpad ollama[125244]: INFO [main] model loaded | tid="139700466872320" timestamp=1744581104
+Apr 13 14:51:44 launchpad ollama[1754]: time=2025-04-13T14:51:44.437-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:44 | 200 |  2.327866018s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:44 | 200 |  155.836254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:45 | 200 |  1.110089181s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:45 | 200 |   75.123658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:46 | 200 |  156.597133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |  1.106637975s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |   73.345248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |  155.373546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |  774.105638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |   74.939363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |  155.301304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |   709.21574ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |   75.734883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |  155.098412ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |  704.067644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |   73.998753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |  154.389104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |    1.0316162s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |    73.81543ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |  157.721209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |   508.86493ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  114.279137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  156.335117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  295.390059ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  155.778386ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.608959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  200.711905ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.383567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.718212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  418.141277ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |   73.875119ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  155.409292ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  616.557785ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |    74.17406ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  157.039781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  453.787568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  156.315774ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  157.406733ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  177.439879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  158.188064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  156.004753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  709.210402ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  116.001729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  156.643015ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  207.541271ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  157.971427ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |   157.60671ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |  736.071452ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |    33.53547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |  158.695871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  254.035204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  115.467262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |   157.13492ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  224.019162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  115.471842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  156.829552ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  170.067603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  115.655003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  158.236461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  229.998766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:01 | 200 |  115.547989ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:01 | 200 |  155.503699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:02 | 200 |  1.346159523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:02 launchpad ollama[1754]: time=2025-04-13T14:52:02.734-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:52:02 launchpad ollama[1754]: time=2025-04-13T14:52:02.879-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.548-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.549-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 40581"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] build info | build=0 commit="unknown" tid="140122625363968" timestamp=1744581123
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122625363968" timestamp=1744581123 total_threads=16
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40581" tid="140122625363968" timestamp=1744581123
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:52:03 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.848-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:52:04 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 14:52:04 launchpad ollama[125352]: INFO [main] model loaded | tid="140122625363968" timestamp=1744581124
+Apr 13 14:52:04 launchpad ollama[1754]: time=2025-04-13T14:52:04.852-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:52:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:11 | 200 |  8.940236674s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.106-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="671.6 MiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9534439424 required="6.2 GiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33843"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] build info | build=0 commit="unknown" tid="139974700773376" timestamp=1744581132
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139974700773376" timestamp=1744581132 total_threads=16
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33843" tid="139974700773376" timestamp=1744581132
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:52:13 launchpad ollama[1754]: time=2025-04-13T14:52:13.049-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:52:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:52:13 launchpad ollama[126368]: INFO [main] model loaded | tid="139974700773376" timestamp=1744581133
+Apr 13 14:52:14 launchpad ollama[1754]: time=2025-04-13T14:52:14.053-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:52:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:16 | 200 |  4.487181651s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.371-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.526-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.526-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39539"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] build info | build=0 commit="unknown" tid="140461685592064" timestamp=1744582825
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140461685592064" timestamp=1744582825 total_threads=16
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39539" tid="140461685592064" timestamp=1744582825
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:20:25 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.831-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:20:26 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:20:26 launchpad ollama[139322]: INFO [main] model loaded | tid="140461685592064" timestamp=1744582826
+Apr 13 15:20:26 launchpad ollama[1754]: time=2025-04-13T15:20:26.835-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 15:20:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:36 | 200 | 11.167640744s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:20:36 launchpad ollama[1754]: time=2025-04-13T15:20:36.703-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="695.0 MiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9517006848 required="6.2 GiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38901"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.404-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] build info | build=0 commit="unknown" tid="139744251215872" timestamp=1744582837
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139744251215872" timestamp=1744582837 total_threads=16
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38901" tid="139744251215872" timestamp=1744582837
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.655-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:20:37 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:20:38 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:20:38 launchpad ollama[141884]: INFO [main] model loaded | tid="139744251215872" timestamp=1744582838
+Apr 13 15:20:38 launchpad ollama[1754]: time=2025-04-13T15:20:38.658-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 15:20:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:38 | 200 |  2.335533744s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:39 | 200 |  219.083659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  1.223402429s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  136.235196ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  218.879614ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:41 | 200 |  1.224914767s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:42 | 200 |  138.740362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:42 | 200 |  220.989007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |  855.910944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |   138.90098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |  220.597509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:44 | 200 |  1.151169528s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:44 | 200 |  136.198097ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |  220.836503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |   750.97529ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |  135.031813ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:46 | 200 |  218.368521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:46 | 200 |  777.931599ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |   97.067331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |  219.454273ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |    470.3714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  179.296821ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  218.896165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  297.237775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  217.391893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  215.870248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |  202.543856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |  218.871805ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |   224.20177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |  460.759715ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |  137.360543ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |   224.83623ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  975.741558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  134.810334ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  218.520099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  461.369368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  177.507663ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  218.762699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  179.885323ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:53 | 200 |  216.628303ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:53 | 200 |  223.107619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  677.738395ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  137.562116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  219.466324ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  211.088517ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  217.568166ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |  218.191054ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |  694.017096ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |     178.036ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  221.769571ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  298.858248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  221.898985ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  223.632845ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  263.110915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |   217.82677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  217.428541ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  211.145909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  218.083669ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  217.529936ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  231.392322ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  175.889072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:59 | 200 |  217.337147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  1.162428015s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  134.319078ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  217.600848ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:01 | 200 |  1.037685909s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:01 | 200 |  135.464764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  218.163688ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  477.654323ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  139.213181ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  219.294195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:04 | 200 |  1.233782221s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:04 launchpad ollama[1754]: time=2025-04-13T15:21:04.252-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:21:04 launchpad ollama[1754]: time=2025-04-13T15:21:04.402-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.088-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.089-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39235"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] build info | build=0 commit="unknown" tid="139817837953024" timestamp=1744582865
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139817837953024" timestamp=1744582865 total_threads=16
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39235" tid="139817837953024" timestamp=1744582865
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:21:05 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.388-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:21:06 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:21:06 launchpad ollama[142039]: INFO [main] model loaded | tid="139817837953024" timestamp=1744582866
+Apr 13 15:21:06 launchpad ollama[1754]: time=2025-04-13T15:21:06.391-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:21:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:18 | 200 | 13.944236364s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:24:05 launchpad ollama[1754]: time=2025-04-13T15:24:05.075-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:24:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:12 | 200 |  7.599873801s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:24:12 launchpad ollama[1754]: time=2025-04-13T15:24:12.842-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="628.9 MiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9514844160 required="6.2 GiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36671"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] build info | build=0 commit="unknown" tid="140111561814016" timestamp=1744583053
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140111561814016" timestamp=1744583053 total_threads=16
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36671" tid="140111561814016" timestamp=1744583053
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.794-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:24:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:24:14 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:24:14 launchpad ollama[146893]: INFO [main] model loaded | tid="140111561814016" timestamp=1744583054
+Apr 13 15:24:14 launchpad ollama[1754]: time=2025-04-13T15:24:14.797-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:24:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:14 | 200 |   2.30259646s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:15 | 200 |  177.778542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |  1.304098939s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |   57.662815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |  182.232523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |  1.355317239s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |   96.168717ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |  179.389372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |  826.725606ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |   99.253145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |  179.874021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |  995.533781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |   99.210141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |  180.905064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |  721.745916ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |   99.566296ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |  184.972369ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:22 | 200 |     907.063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:22 | 200 |    97.61916ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  179.471147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  523.254887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |   57.297338ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  139.995299ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  255.328335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  176.788567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  175.849215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  198.014911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  178.582029ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  137.825148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  465.989827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  137.951638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  174.259795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |  615.923479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |   98.989301ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |  180.735114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  448.715728ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  176.590489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  176.606538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  175.912116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  174.230091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:28 | 200 |  176.343368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:28 | 200 |  852.401871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |   92.890689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  175.123809ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  204.803523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  176.510692ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  178.906033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  726.008503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  135.877756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  133.923684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  282.130074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  179.448196ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  180.601225ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  252.863629ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |   173.38114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  173.141291ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |   202.71872ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  178.541055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  176.393227ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  223.083898ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:33 | 200 |  133.447604ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:33 | 200 |  175.584425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:34 | 200 |  1.485007028s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:34 | 200 |   92.675692ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:35 | 200 |  175.979136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  929.812505ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |   93.013605ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  174.103003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  452.557148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |   97.380864ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:37 | 200 |  177.753677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:38 | 200 |  1.250041522s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:38 launchpad ollama[1754]: time=2025-04-13T15:24:38.398-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:24:38 launchpad ollama[1754]: time=2025-04-13T15:24:38.537-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.201-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.201-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36773"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] build info | build=0 commit="unknown" tid="140169570066432" timestamp=1744583079
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140169570066432" timestamp=1744583079 total_threads=16
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36773" tid="140169570066432" timestamp=1744583079
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:24:39 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.504-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:24:40 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:24:40 launchpad ollama[147052]: INFO [main] model loaded | tid="140169570066432" timestamp=1744583080
+Apr 13 15:24:40 launchpad ollama[1754]: time=2025-04-13T15:24:40.507-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 15:24:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:52 | 200 | 14.585918113s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:28:16 launchpad ollama[1754]: time=2025-04-13T15:28:16.246-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:28:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:29 | 200 | 13.646273291s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.058-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="632.9 MiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.771-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9493807104 required="6.2 GiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.771-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.772-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.772-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46661"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] build info | build=0 commit="unknown" tid="139671107436544" timestamp=1744583310
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139671107436544" timestamp=1744583310 total_threads=16
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46661" tid="139671107436544" timestamp=1744583310
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:28:31 launchpad ollama[1754]: time=2025-04-13T15:28:31.024-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:28:31 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:28:31 launchpad ollama[155592]: INFO [main] model loaded | tid="139671107436544" timestamp=1744583311
+Apr 13 15:28:32 launchpad ollama[1754]: time=2025-04-13T15:28:32.028-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 15:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:32 | 200 |  2.309975384s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:32 | 200 |  175.763556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  1.046365842s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  100.228233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  180.818243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:34 | 200 |  1.048444731s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:34 | 200 |   93.478486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |  175.213153ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |  781.520866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |   92.795907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:36 | 200 |  174.127401ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |  925.527678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |   92.992648ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |  176.601691ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |  717.241382ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |    94.28505ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |  175.718702ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |  894.676217ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |   93.103377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |  173.829834ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  512.827078ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  137.538919ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  176.199666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |   295.36699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  176.711206ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  175.252962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  200.886685ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  174.826479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  174.554458ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |   427.65367ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |   93.555176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |  174.998882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |  623.251533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |   94.898177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  174.574368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  454.208739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  175.775594ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  177.891789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  178.793725ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  174.754393ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  173.874997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |  717.899963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |   93.361893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |   175.09635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |  208.630502ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  174.736263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  175.305363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  704.026421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  134.881955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |   175.89158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |   296.27863ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  176.761058ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  175.372341ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  262.516666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  175.024969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  174.266167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  210.161869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  174.971625ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  176.472332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  230.807281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  134.063488ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  174.437775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |  1.557817623s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |   95.108878ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |  135.748623ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:52 | 200 |  1.093852194s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:52 | 200 |   94.012566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  176.254603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  424.294465ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |   94.378392ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  175.750028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:54 | 200 |  1.052018571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:54 launchpad ollama[1754]: time=2025-04-13T15:28:54.889-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.034-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.701-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.701-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 46311"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] build info | build=0 commit="unknown" tid="139702568947712" timestamp=1744583335
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139702568947712" timestamp=1744583335 total_threads=16
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46311" tid="139702568947712" timestamp=1744583335
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:28:55 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: time=2025-04-13T15:28:56.001-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:28:56 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:28:56 launchpad ollama[155704]: INFO [main] model loaded | tid="139702568947712" timestamp=1744583336
+Apr 13 15:28:57 launchpad ollama[1754]: time=2025-04-13T15:28:57.005-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:29:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:29:10 | 200 | 15.528294021s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:30:47 launchpad ollama[1754]: time=2025-04-13T15:30:47.444-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:30:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:30:59 | 200 | 12.251677743s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:30:59 launchpad ollama[1754]: time=2025-04-13T15:30:59.866-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="645.2 MiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9517268992 required="6.2 GiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44167"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] build info | build=0 commit="unknown" tid="140716815843328" timestamp=1744583460
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140716815843328" timestamp=1744583460 total_threads=16
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44167" tid="140716815843328" timestamp=1744583460
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.815-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:31:00 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:31:01 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:31:01 launchpad ollama[162171]: INFO [main] model loaded | tid="140716815843328" timestamp=1744583461
+Apr 13 15:31:01 launchpad ollama[1754]: time=2025-04-13T15:31:01.818-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:31:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:01 | 200 |     2.295382s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:02 | 200 |   172.82425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |  1.023615991s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |   92.011696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |  172.003511ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |   1.01405703s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |    93.34209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |  172.470598ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |  827.630636ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |   90.832012ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |  171.703666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:06 | 200 |  977.696344ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:06 | 200 |   90.990238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |  171.661044ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |  701.324264ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |   90.880075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |  171.233842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |   725.87934ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |   90.319721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  171.049921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  510.984656ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  132.815278ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  171.691165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |   293.35695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  174.095509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  172.078875ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  200.772509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  172.798378ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  171.343274ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  435.532499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  131.438092ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  175.380635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |  615.994352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |   91.321108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |  171.567546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  493.008798ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  132.712482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  130.858442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  178.975808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  171.302138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  171.519109ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  676.712644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |   89.402666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  171.906299ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  207.864117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  172.189789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  171.544534ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |  735.047485ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |  130.804605ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |   171.68224ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  292.339542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  171.780638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  171.663376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  265.501974ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  174.116283ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  172.011446ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  209.780381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  172.116588ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  171.302487ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  228.867021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:19 | 200 |  130.427285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:19 | 200 |  172.539627ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |   939.93145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |   90.696951ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |  172.931688ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |  1.002579344s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |   91.129161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |  171.674444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |  460.472677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |     90.9708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |  172.491782ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:23 | 200 |  1.015437557s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:23 launchpad ollama[1754]: time=2025-04-13T15:31:23.688-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:31:23 launchpad ollama[1754]: time=2025-04-13T15:31:23.836-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.506-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.506-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33047"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] build info | build=0 commit="unknown" tid="139816642605056" timestamp=1744583484
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139816642605056" timestamp=1744583484 total_threads=16
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33047" tid="139816642605056" timestamp=1744583484
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:31:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.806-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:31:25 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:31:25 launchpad ollama[162284]: INFO [main] model loaded | tid="139816642605056" timestamp=1744583485
+Apr 13 15:31:25 launchpad ollama[1754]: time=2025-04-13T15:31:25.809-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:31:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:39 | 200 | 16.091503946s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:37:14 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 15:37:14 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 15:37:14 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 15:37:14 launchpad systemd[1]: ollama.service: Consumed 56min 8.863s CPU time, 12.6G memory peak, 11.4G read from disk, 508.1M written to disk, 19M incoming IP traffic, 30.2M outgoing IP traffic.
+-- Boot a78f12391ecd4729a571ed50e2e04cee --
+Apr 13 15:37:56 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 15:37:56 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 15:37:56 launchpad ollama[1756]: 2025/04/13 15:37:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.480-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.490-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.491-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.492-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3624026559/runners
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.472-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.473-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.473-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.713-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 15:39:10 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:10 | 200 |    3.758748ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 15:39:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:11 | 200 |     543.705µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 15:39:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:11 | 200 |     767.313µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 15:39:18 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:18 | 200 |      36.682µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10312744960 required="6.2 GiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38011"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.914-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] build info | build=0 commit="unknown" tid="140634880040960" timestamp=1744584119
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140634880040960" timestamp=1744584119 total_threads=16
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38011" tid="140634880040960" timestamp=1744584119
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: time=2025-04-13T15:41:59.165-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_head           = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model type       = 8B
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: max token length = 256
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:41:59 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:42:05 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:42:05 launchpad ollama[5628]: INFO [main] model loaded | tid="140634880040960" timestamp=1744584125
+Apr 13 15:42:05 launchpad ollama[1756]: time=2025-04-13T15:42:05.435-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.52 seconds"
+Apr 13 15:42:05 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:05 | 200 |  6.885074904s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:11 | 200 |  195.666683ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:12 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:12 | 200 |  1.342602523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:12 launchpad ollama[1756]: time=2025-04-13T15:42:12.609-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:12 launchpad ollama[1756]: time=2025-04-13T15:42:12.754-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10312744960 required="9.2 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.0 GiB" free_swap="68.9 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38247"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.456-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] build info | build=0 commit="unknown" tid="140651021824000" timestamp=1744584133
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140651021824000" timestamp=1744584133 total_threads=16
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38247" tid="140651021824000" timestamp=1744584133
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_head           = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model type       = 13B
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: max token length = 48
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:42:13 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.706-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:42:22 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:42:22 launchpad ollama[5741]: INFO [main] model loaded | tid="140651021824000" timestamp=1744584142
+Apr 13 15:42:22 launchpad ollama[1756]: time=2025-04-13T15:42:22.231-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Apr 13 15:42:37 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:37 | 200 | 24.657033543s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:42:37 launchpad ollama[1756]: time=2025-04-13T15:42:37.331-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:38 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:38 | 200 |  1.426392962s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:42:38 launchpad ollama[1756]: time=2025-04-13T15:42:38.799-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:43 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:43 | 200 |  5.189497678s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:45:50 launchpad ollama[1756]: time=2025-04-13T15:45:50.385-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="905.0 MiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.068-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10316021760 required="6.2 GiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.068-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.069-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.069-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45013"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] build info | build=0 commit="unknown" tid="139679094628352" timestamp=1744584351
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139679094628352" timestamp=1744584351 total_threads=16
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45013" tid="139679094628352" timestamp=1744584351
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.321-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_head           = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model type       = 8B
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: max token length = 256
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:45:51 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:45:52 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:45:52 launchpad ollama[6495]: INFO [main] model loaded | tid="139679094628352" timestamp=1744584352
+Apr 13 15:45:52 launchpad ollama[1756]: time=2025-04-13T15:45:52.324-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:45:52 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:52 | 200 |   2.29403766s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:52 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:52 | 200 |  175.115602ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:54 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:54 | 200 |  1.481339439s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:54 launchpad ollama[1756]: time=2025-04-13T15:45:54.202-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:45:54 launchpad ollama[1756]: time=2025-04-13T15:45:54.343-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.041-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10316021760 required="9.2 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.042-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.042-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 41581"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.044-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] build info | build=0 commit="unknown" tid="140406867972096" timestamp=1744584355
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140406867972096" timestamp=1744584355 total_threads=16
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41581" tid="140406867972096" timestamp=1744584355
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_head           = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model type       = 13B
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: max token length = 48
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:45:55 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.341-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:45:56 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:45:56 launchpad ollama[6524]: INFO [main] model loaded | tid="140406867972096" timestamp=1744584356
+Apr 13 15:45:56 launchpad ollama[1756]: time=2025-04-13T15:45:56.344-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:46:01 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:46:01 | 200 |  7.408036594s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:55:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 15:55:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 15:55:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 15:55:41 launchpad systemd[1]: ollama.service: Consumed 41.005s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 20128fdcbaa2418e9d2aceed164d3c06 --
+Apr 13 15:56:22 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 15:56:22 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 15:56:22 launchpad ollama[1760]: 2025/04/13 15:56:22 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.165-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.175-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.176-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.177-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1080162874/runners
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.347-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.348-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.348-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.567-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |    3.583045ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |     658.458µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |     723.653µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 16:16:01 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:16:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:16:01 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:16:01 launchpad systemd[1]: ollama.service: Consumed 3.644s CPU time, 786.5M memory peak, 234.4M read from disk, 508.1M written to disk, 1.3K incoming IP traffic, 6.2K outgoing IP traffic.
+-- Boot b8bbb9ea00c44553bbb55609f6f047a8 --
+Apr 13 16:16:40 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:16:40 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:16:41 launchpad ollama[1764]: 2025/04/13 16:16:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.045-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.055-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.056-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.057-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama750748247/runners
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.027-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.028-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.028-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.029-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.029-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.261-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:30:08 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:08 | 200 |    3.547667ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:30:10 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:10 | 200 |      688.84µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:30:10 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:10 | 200 |     748.102µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 16:51:31 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:51:31 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:51:31 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:51:31 launchpad systemd[1]: ollama.service: Consumed 3.450s CPU time, 786.7M memory peak, 234.4M read from disk, 508.1M written to disk, 1.3K incoming IP traffic, 6.2K outgoing IP traffic.
+-- Boot dee112a04b9e4f0a8d0dead2f8cfb4fc --
+Apr 13 16:52:12 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:52:12 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:52:12 launchpad ollama[1762]: 2025/04/13 16:52:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.182-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.193-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.194-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.196-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1951053274/runners
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.176-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.405-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:54:47 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:54:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:54:47 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:54:47 launchpad systemd[1]: ollama.service: Consumed 3.423s CPU time, 787.1M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 43ecbe2f41fb4b8da3b592399ebf60e2 --
+Apr 13 16:55:26 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:55:27 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:55:27 launchpad ollama[1758]: 2025/04/13 16:55:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.146-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.155-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.156-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.158-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama841105555/runners
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.129-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.130-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.130-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.358-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 17:28:51 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 17:28:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 17:28:51 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 17:28:51 launchpad systemd[1]: ollama.service: Consumed 3.439s CPU time, 786.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 5268246c82d140dd8cf2d9bfdbe01090 --
+Apr 13 17:29:45 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 17:29:45 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 17:29:45 launchpad ollama[1771]: 2025/04/13 17:29:45 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.525-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.536-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.537-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.539-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama781222968/runners
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.465-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:50 launchpad ollama[1771]: time=2025-04-13T17:29:50.235-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 13 17:30:26 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 17:30:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 17:30:26 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 17:30:26 launchpad systemd[1]: ollama.service: Consumed 5.191s CPU time, 789.7M memory peak, 237.4M read from disk, 508.1M written to disk.
+-- Boot e268674044bb476e80f6a4fbb0f4cc1b --
+Apr 13 17:31:07 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 17:31:07 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 17:31:07 launchpad ollama[1756]: 2025/04/13 17:31:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.517-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.526-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.527-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.529-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama621053729/runners
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.579-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.580-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.812-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 18:03:15 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:15 | 200 |    3.668785ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 18:03:16 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:16 | 200 |     598.154µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 18:03:16 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:16 | 200 |     611.121µs |       127.0.0.1 | GET      "/api/version"
+Apr 15 14:20:41 launchpad ollama[1756]: [GIN] 2025/04/15 - 14:20:41 | 200 |     687.345µs |       127.0.0.1 | GET      "/api/tags"
+Apr 15 14:20:41 launchpad ollama[1756]: [GIN] 2025/04/15 - 14:20:41 | 200 |      49.147µs |       127.0.0.1 | GET      "/api/version"
+Apr 15 15:36:17 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:36:17 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:36:17 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:36:17 launchpad systemd[1]: ollama.service: Consumed 4.198s CPU time, 786.7M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 431c14e51fac4578a0e4f6280b7e5e30 --
+Apr 15 15:37:06 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:37:06 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:37:06 launchpad ollama[1752]: 2025/04/15 15:37:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.397-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.408-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.409-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.411-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3612536661/runners
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.397-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.397-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:17 launchpad ollama[1752]: time=2025-04-15T15:37:17.499-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 15:37:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:37:39 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:37:39 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:37:39 launchpad systemd[1]: ollama.service: Consumed 11.360s CPU time, 786.4M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 25edadc520564711bb6293d77d6db8fe --
+Apr 15 15:38:19 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:38:19 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:38:19 launchpad ollama[1756]: 2025/04/15 15:38:19 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.551-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.559-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.561-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.562-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama251922655/runners
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.543-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.543-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.775-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 15:56:27 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:56:28 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:56:28 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:56:28 launchpad systemd[1]: ollama.service: Consumed 3.445s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot f683eeafa22e40de8bb52c8456b74fc9 --
+Apr 15 15:58:28 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:58:28 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:58:28 launchpad ollama[1758]: 2025/04/15 15:58:28 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.980-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.987-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.988-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.989-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1509245173/runners
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.996-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:32 launchpad ollama[1758]: time=2025-04-15T15:58:32.224-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 16:20:25 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 16:20:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 16:20:25 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 16:20:25 launchpad systemd[1]: ollama.service: Consumed 3.464s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot e33af310e153423592a7dc42621bfb6b --
+Apr 15 16:21:05 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 16:21:05 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 16:21:05 launchpad ollama[1751]: 2025/04/15 16:21:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.756-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.764-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.764-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.766-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama134506118/runners
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.748-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.749-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.749-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.750-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.750-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.959-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 16:41:30 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 16:41:30 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 16:41:30 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 16:41:30 launchpad systemd[1]: ollama.service: Consumed 3.412s CPU time, 787.8M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot ff2aa01d05b342759a68d202eb1e0194 --
+Apr 15 16:42:11 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 16:42:11 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 16:42:11 launchpad ollama[1765]: 2025/04/15 16:42:11 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.520-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.528-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.530-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.532-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3944000283/runners
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.569-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.788-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:11:55 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:11:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:11:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:11:56 launchpad systemd[1]: ollama.service: Consumed 3.484s CPU time, 787M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 4d2c224ef067436aaa53e578ed6a3c23 --
+Apr 15 17:12:35 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:12:35 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:12:35 launchpad ollama[1756]: 2025/04/15 17:12:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.952-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.960-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.961-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.964-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama719220119/runners
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.950-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:39 launchpad ollama[1756]: time=2025-04-15T17:12:39.171-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:29:03 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:29:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:29:03 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:29:03 launchpad systemd[1]: ollama.service: Consumed 3.425s CPU time, 786.6M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 92afaf03a1c241d4b4e10462241fd95d --
+Apr 15 17:29:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:29:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:29:44 launchpad ollama[1753]: 2025/04/15 17:29:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.267-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.275-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.276-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.278-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama108880241/runners
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.320-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.546-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:36:30 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:36:30 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:36:30 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:36:30 launchpad systemd[1]: ollama.service: Consumed 3.480s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot d88d7d2304fb4d7b86d2ef0adda036df --
+Apr 15 17:37:12 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:37:12 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:37:12 launchpad ollama[1755]: 2025/04/15 17:37:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.496-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.505-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.506-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.507-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2772590638/runners
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.564-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.564-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.780-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 18:42:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 18:42:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 18:42:18 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 18:42:18 launchpad systemd[1]: ollama.service: Consumed 3.517s CPU time, 787.6M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot adf2dd4a7a96414d875044bff7cddcef --
+Apr 15 18:42:59 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 18:42:59 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 18:42:59 launchpad ollama[1766]: 2025/04/15 18:42:59 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.304-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.312-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.313-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.315-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3858422155/runners
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.348-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.568-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:05:37 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:05:37 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:05:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:05:38 launchpad systemd[1]: ollama.service: Consumed 3.479s CPU time, 787M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 56deafc783e24c26bad044f3835079cc --
+Apr 15 19:06:17 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:06:17 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:06:18 launchpad ollama[1751]: 2025/04/15 19:06:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.103-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.110-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.111-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.112-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1087723250/runners
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.155-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.156-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.156-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.157-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.157-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.384-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:34:06 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:34:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:34:06 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:34:06 launchpad systemd[1]: ollama.service: Consumed 3.491s CPU time, 786.7M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 8b2c7e99e0864c20b11a5e72897b1d52 --
+Apr 15 19:34:52 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:34:52 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:34:53 launchpad ollama[1766]: 2025/04/15 19:34:53 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.066-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.075-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.076-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.078-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama955304025/runners
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.057-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:57 launchpad ollama[1766]: time=2025-04-15T19:34:57.812-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 19:35:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:35:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:35:18 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:35:18 launchpad systemd[1]: ollama.service: Consumed 5.278s CPU time, 787M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 8c1bec5b01f44177a52bc758ca6ad5c4 --
+Apr 15 19:35:58 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:35:58 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:35:58 launchpad ollama[1756]: 2025/04/15 19:35:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.593-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.602-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.603-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.605-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama326345868/runners
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.642-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.879-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:41:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:41:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:41:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:41:41 launchpad systemd[1]: ollama.service: Consumed 3.472s CPU time, 787.4M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 723cc3307f3b4b60b2a7d48a856c4333 --
+Apr 15 19:42:36 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:42:36 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:42:36 launchpad ollama[1766]: 2025/04/15 19:42:36 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.558-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.566-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.567-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.569-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama135249836/runners
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.498-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.499-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.499-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.500-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.500-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:41 launchpad ollama[1766]: time=2025-04-15T19:42:41.372-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 19:43:01 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:43:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:43:01 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:43:01 launchpad systemd[1]: ollama.service: Consumed 5.316s CPU time, 790.7M memory peak, 237.6M read from disk, 508.1M written to disk.
+-- Boot 046f08952da54b2a93bdfe876ed593e3 --
+Apr 15 19:43:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:43:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:43:44 launchpad ollama[1763]: 2025/04/15 19:43:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.136-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.145-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.146-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.147-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2763501498/runners
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.184-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.184-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.413-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:49:05 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:49:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:49:05 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:49:05 launchpad systemd[1]: ollama.service: Consumed 3.476s CPU time, 787.7M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 7ec7b52398314a5089d714a76ff8aafd --
+Apr 16 08:49:25 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 08:49:26 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 08:49:26 launchpad ollama[1767]: 2025/04/16 08:49:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.154-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.161-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.162-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.163-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1336750794/runners
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.356-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 09:52:34 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 09:52:34 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 09:52:34 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 09:52:34 launchpad systemd[1]: ollama.service: Consumed 3.443s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 67348c505ba8474fac5f64b2b50ede37 --
+Apr 16 09:53:08 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 09:53:08 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 09:53:08 launchpad ollama[1758]: 2025/04/16 09:53:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.788-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.796-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.797-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.798-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3764926553/runners
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.773-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.774-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.774-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.775-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.775-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.991-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 09:58:32 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 09:58:32 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 09:58:32 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 09:58:32 launchpad systemd[1]: ollama.service: Consumed 3.403s CPU time, 787.2M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 22011e926bda407082bdad0deef3251d --
+Apr 16 09:59:04 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 09:59:04 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 09:59:04 launchpad ollama[1758]: 2025/04/16 09:59:04 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.576-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.586-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.586-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.587-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3002500503/runners
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.616-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.617-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.617-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.618-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.618-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.833-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:04:42 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:04:42 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:04:42 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:04:42 launchpad systemd[1]: ollama.service: Consumed 3.454s CPU time, 787.2M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 1cdc3f1f0ea5475cae0edd2be2a5d86b --
+Apr 16 10:05:14 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:05:14 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:05:14 launchpad ollama[1763]: 2025/04/16 10:05:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.681-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.691-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.692-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.693-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2353227385/runners
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.705-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.705-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.920-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:16:57 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:16:57 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:16:57 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:16:57 launchpad systemd[1]: ollama.service: Consumed 3.459s CPU time, 786.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot e43d69d9549746058c4c3cef5f453029 --
+Apr 16 10:17:29 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:17:29 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:17:29 launchpad ollama[1751]: 2025/04/16 10:17:29 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.306-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.315-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.316-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.317-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1196898257/runners
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.352-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.352-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.566-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:28:53 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:28:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:28:54 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:28:54 launchpad systemd[1]: ollama.service: Consumed 3.429s CPU time, 786.8M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c129e31b363247f9b053a7c6135bcb92 --
+Apr 16 10:29:25 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:29:25 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:29:26 launchpad ollama[1756]: 2025/04/16 10:29:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.108-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.116-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.117-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.118-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1140884484/runners
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.150-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.373-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:49:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:49:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:49:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:49:41 launchpad systemd[1]: ollama.service: Consumed 3.488s CPU time, 786.7M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot b7737e9169e547b3ba5bad348f5ca7cf --
+Apr 16 10:50:13 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:50:13 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:50:13 launchpad ollama[1541]: 2025/04/16 10:50:13 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.381-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.393-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.393-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.396-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2767395511/runners
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.432-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.433-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.665-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 12:09:45 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 12:09:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 12:09:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 12:09:45 launchpad systemd[1]: ollama.service: Consumed 3.522s CPU time, 787M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot d5d86d060a234fe6a240e2a1ffc43b8e --
+Apr 16 12:10:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 12:10:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 12:10:20 launchpad ollama[1547]: 2025/04/16 12:10:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.428-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.440-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.441-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.443-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1129470658/runners
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.564-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.565-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.566-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.566-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.801-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:02:06 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:02:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:02:06 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:02:06 launchpad systemd[1]: ollama.service: Consumed 3.847s CPU time, 787.4M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot da5d640ed2764ad2ab3162ee265faccc --
+Apr 17 08:02:39 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:02:39 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:02:39 launchpad ollama[1558]: 2025/04/17 08:02:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.459-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.468-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.469-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.471-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3039898910/runners
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.503-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.503-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.720-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:04:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:04:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:04:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:04:19 launchpad systemd[1]: ollama.service: Consumed 3.456s CPU time, 787.4M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 8f7476c1ba934d1db993b63d96926b29 --
+Apr 17 08:04:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:04:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:04:55 launchpad ollama[1554]: 2025/04/17 08:04:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.299-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.310-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.311-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.313-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2590820481/runners
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.343-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.343-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.850-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Apr 17 08:04:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:04:59 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:04:59 launchpad systemd[1]: ollama.service: Consumed 3.448s CPU time, 767.6M memory peak, 234.3M read from disk, 507.3M written to disk.
+-- Boot 758f89d306924c14ac8df7085e6f4c8c --
+Apr 17 08:05:41 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:05:41 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:05:41 launchpad ollama[1547]: 2025/04/17 08:05:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.421-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.431-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.432-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.433-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3042380030/runners
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.421-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.421-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.650-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.8 GiB"
+Apr 17 08:18:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:18:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:18:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:18:19 launchpad systemd[1]: ollama.service: Consumed 3.448s CPU time, 787.4M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot a7504e0bc21940739f8f68c60693dfbc --
+Apr 17 08:18:57 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:18:57 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:18:57 launchpad ollama[1563]: 2025/04/17 08:18:57 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.484-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.493-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.495-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.495-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1624591756/runners
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:02 launchpad ollama[1563]: time=2025-04-17T08:19:02.187-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 17 08:19:12 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:19:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:19:12 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:19:12 launchpad systemd[1]: ollama.service: Consumed 5.391s CPU time, 787.3M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 2d85acb4569a4560b5355bd9c65e285d --
+Apr 17 08:19:44 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:19:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:19:44 launchpad ollama[1550]: 2025/04/17 08:19:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.390-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.398-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.399-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.400-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1046240382/runners
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.441-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.442-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.442-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.665-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:33:33 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:33:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:33:33 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:33:33 launchpad systemd[1]: ollama.service: Consumed 3.482s CPU time, 787.4M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 587a9d72538846239e9329ff2ac5527d --
+Apr 17 08:34:06 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:34:06 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:34:06 launchpad ollama[1550]: 2025/04/17 08:34:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.375-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.385-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.387-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.388-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama568753210/runners
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.423-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.424-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.424-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.425-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.425-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.638-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 13:16:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:34 | 200 |    3.804253ms |       127.0.0.1 | GET      "/api/tags"
+Apr 17 13:16:35 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:35 | 200 |     601.213µs |       127.0.0.1 | GET      "/api/tags"
+Apr 17 13:16:35 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:35 | 200 |       37.78µs |       127.0.0.1 | GET      "/api/version"
+Apr 17 13:16:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:37 | 200 |      26.101µs |       127.0.0.1 | GET      "/api/version"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=8983150592 required="7.7 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.449-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35895"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] build info | build=0 commit="unknown" tid="139854789881856" timestamp=1744921041
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139854789881856" timestamp=1744921041 total_threads=16
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35895" tid="139854789881856" timestamp=1744921041
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.701-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:17:21 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 16384
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:17:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:17:27 launchpad ollama[67599]: INFO [main] model loaded | tid="139854789881856" timestamp=1744921047
+Apr 17 13:17:27 launchpad ollama[1550]: time=2025-04-17T13:17:27.719-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 17 13:17:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:30 | 200 |  9.126188526s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:17:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:30 | 200 |  426.463896ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:17:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:32 | 200 |  1.669589102s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8999927808 required="6.2 GiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33245"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] build info | build=0 commit="unknown" tid="139715506327552" timestamp=1744921091
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139715506327552" timestamp=1744921091 total_threads=16
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33245" tid="139715506327552" timestamp=1744921091
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.870-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:18:11 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:18:12 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:18:12 launchpad ollama[67755]: INFO [main] model loaded | tid="139715506327552" timestamp=1744921092
+Apr 17 13:18:12 launchpad ollama[1550]: time=2025-04-17T13:18:12.874-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 17 13:18:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:18:17 | 200 |   6.61916553s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:20:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:20:18 | 200 |  4.209518826s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.877-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9026928640 required="6.2 GiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.878-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.878-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39669"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] build info | build=0 commit="unknown" tid="140277726531584" timestamp=1744921923
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140277726531584" timestamp=1744921923 total_threads=16
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39669" tid="140277726531584" timestamp=1744921923
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:32:04 launchpad ollama[1550]: time=2025-04-17T13:32:04.131-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:32:04 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:32:04 launchpad ollama[69966]: INFO [main] model loaded | tid="140277726531584" timestamp=1744921924
+Apr 17 13:32:05 launchpad ollama[1550]: time=2025-04-17T13:32:05.135-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 17 13:32:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:32:14 | 200 | 11.216938661s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9027059712 required="6.2 GiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45415"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.858-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] build info | build=0 commit="unknown" tid="139958814859264" timestamp=1744922766
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139958814859264" timestamp=1744922766 total_threads=16
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45415" tid="139958814859264" timestamp=1744922766
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:46:07 launchpad ollama[1550]: time=2025-04-17T13:46:07.109-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:46:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:46:07 launchpad ollama[72256]: INFO [main] model loaded | tid="139958814859264" timestamp=1744922767
+Apr 17 13:46:08 launchpad ollama[1550]: time=2025-04-17T13:46:08.112-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:46:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:46:18 | 200 | 11.746749074s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.266-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9132834816 required="6.2 GiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.266-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.3 GiB" free_swap="68.9 GiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.267-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44429"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] build info | build=0 commit="unknown" tid="139760036306944" timestamp=1744923159
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139760036306944" timestamp=1744923159 total_threads=16
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44429" tid="139760036306944" timestamp=1744923159
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.519-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:52:39 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:52:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:52:40 launchpad ollama[73282]: INFO [main] model loaded | tid="139760036306944" timestamp=1744923160
+Apr 17 13:52:40 launchpad ollama[1550]: time=2025-04-17T13:52:40.522-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:52:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:52:49 | 200 | 10.664116085s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.891-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8997306368 required="6.2 GiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.891-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.892-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42783"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] build info | build=0 commit="unknown" tid="139996302262272" timestamp=1744923567
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139996302262272" timestamp=1744923567 total_threads=16
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42783" tid="139996302262272" timestamp=1744923567
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:59:28 launchpad ollama[1550]: time=2025-04-17T13:59:28.144-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:59:28 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:59:28 launchpad ollama[74490]: INFO [main] model loaded | tid="139996302262272" timestamp=1744923568
+Apr 17 13:59:29 launchpad ollama[1550]: time=2025-04-17T13:59:29.147-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:59:36 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:59:36 | 200 |  8.726327132s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:00:16 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:16 | 200 |  4.472001558s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:00:21 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:21 | 200 |  4.418741149s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:00:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:24 | 200 |  2.612630528s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:11 | 200 |  1.208514073s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:02:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:12 | 200 |  177.531961ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:17 | 200 |  178.745028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:18 | 200 |  835.093664ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:21 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:21 | 200 |  3.020288958s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:06:20 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:20 | 200 |  3.408414633s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:24 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1234 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744923984
+Apr 17 14:06:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:25 | 200 |  1.077575454s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:06:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:26 | 200 |  996.106633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:27 | 200 |  996.704268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:28 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:28 | 200 |  938.107467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:29 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:29 | 200 |   915.24396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:30 | 200 |  998.805303ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:31 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:31 | 200 |  1.384641956s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:38 | 200 |  6.897968821s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:07:44 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=5065 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924064
+Apr 17 14:07:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:50 | 200 |  6.262280302s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:07:50 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924070
+Apr 17 14:07:51 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:51 | 200 |  1.178453488s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:51 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924071
+Apr 17 14:07:52 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:52 | 200 |  1.176354245s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:53 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:53 | 200 |  1.012140902s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:53 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924073
+Apr 17 14:07:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:54 | 200 |  1.057069122s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:55 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924075
+Apr 17 14:07:56 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:56 | 200 |  1.141288272s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:57 | 200 |  1.506940093s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:57 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1392 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924077
+Apr 17 14:08:02 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:08:02 | 200 |  4.822690255s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:09:19 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1864 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924159
+Apr 17 14:09:22 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:22 | 200 |  3.301964076s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:09:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:23 | 200 |  177.759381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:23 | 200 |  175.241286ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |  938.075968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |   95.604798ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |   175.74611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:25 | 200 |  1.302571234s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:31 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:31 | 200 |  6.045540856s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:10:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:44 | 200 |  1.275281542s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:10:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:46 | 200 |  179.586836ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:46 | 200 |  182.940639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:47 | 200 |  919.367071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:47 | 200 |   95.890195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:48 | 200 |  177.006476ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:49 | 200 |  1.202086651s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:54 | 200 |  5.428032754s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:14:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:12 | 200 |   1.15313724s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:14:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:13 | 200 |  198.856371ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:13 | 200 |  199.621843ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |  1.169946923s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |   77.287107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |  199.272619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:15 | 200 |  1.118591732s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:23 | 200 |  7.988811445s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:16:56 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1485 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924616
+Apr 17 14:17:04 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:04 | 200 |  7.620892723s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:17:05 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:05 | 200 |   1.01311995s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:06 | 200 |  1.013174825s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:07 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:07 | 200 |  851.051034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:07 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:07 | 200 |  932.455377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:08 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:08 | 200 |  971.281598ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:10 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:10 | 200 |  1.430668485s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:17 | 200 |  7.480247037s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.811-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9012707328 required="6.2 GiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.811-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="68.9 GiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.812-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33917"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] build info | build=0 commit="unknown" tid="139956753145856" timestamp=1744924945
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956753145856" timestamp=1744924945 total_threads=16
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33917" tid="139956753145856" timestamp=1744924945
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 14:22:26 launchpad ollama[1550]: time=2025-04-17T14:22:26.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 14:22:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 14:22:26 launchpad ollama[78330]: INFO [main] model loaded | tid="139956753145856" timestamp=1744924946
+Apr 17 14:22:27 launchpad ollama[1550]: time=2025-04-17T14:22:27.067-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 14:22:29 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:29 | 200 |  3.836944271s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:22:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:32 | 200 |  194.112954ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:32 | 200 |  193.831037ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  833.424983ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  106.871435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  189.877355ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:34 | 200 |  940.721332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:43 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:43 | 200 |  8.678793145s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:26:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:25 | 200 |  1.458377354s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:26:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:26 | 200 |  218.019292ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:26 | 200 |  216.401377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |  1.138773958s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |   95.320243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |  217.148759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:28 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:28 | 200 |  1.116623902s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:34 | 200 |  6.081919408s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.672-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9008578560 required="6.2 GiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.672-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.673-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.673-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46437"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] build info | build=0 commit="unknown" tid="139638404534272" timestamp=1744925690
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139638404534272" timestamp=1744925690 total_threads=16
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46437" tid="139638404534272" timestamp=1744925690
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.925-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 14:34:50 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 14:34:51 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 14:34:51 launchpad ollama[80427]: INFO [main] model loaded | tid="139638404534272" timestamp=1744925691
+Apr 17 14:34:51 launchpad ollama[1550]: time=2025-04-17T14:34:51.928-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 14:34:52 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2854 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744925692
+Apr 17 14:34:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:34:58 | 200 |  8.378664871s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:35:00 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:00 | 200 |  1.390434188s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:01 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:01 | 200 |   1.38910173s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:02 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:02 | 200 |  819.416884ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:03 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:03 | 200 |  1.350342709s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:05 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:05 | 200 |  1.390120929s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:06 | 200 |  947.665362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:12 | 200 |  6.235460047s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:37:55 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1140 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744925875
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  1.562747591s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  198.065407ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  196.614327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  792.646508ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  116.488537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  197.848106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:59 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:59 | 200 |  1.083108318s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:38:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:38:06 | 200 |  6.245748834s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:40:36 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2459 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744926036
+Apr 17 14:40:43 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:43 | 200 |  7.247812272s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:40:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:44 | 200 |  1.584068257s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:46 | 200 |  1.583251256s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:47 | 200 |  1.342984628s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:49 | 200 |  1.501316011s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:50 | 200 |  1.584905153s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:52 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:52 | 200 |  1.448927399s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:54 | 200 |  2.424130034s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |  3.336877882s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |   179.73678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |  179.144489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |  906.964389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |   97.826816ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |  177.931558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:39 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:39 | 200 |  1.164327037s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:44 | 200 |  5.028012933s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:42:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:47 | 200 |  909.093326ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:42:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:48 | 200 |  177.833467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:48 | 200 |  178.264789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |  925.001297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |   94.540628ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |  177.681184ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:50 | 200 |  1.290448752s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:54 | 200 |  4.047281252s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:44:53 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:53 | 200 |  1.442522939s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:44:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:54 | 200 |  184.355254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:54 | 200 |  181.365681ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  913.135584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  101.320559ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  179.596933ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:56 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:56 | 200 |  1.107232509s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:45:03 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:45:03 | 200 |   6.44463933s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  992.647304ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  198.583609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  198.038833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:14 | 200 |  1.149953098s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:15 | 200 |  115.614971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:15 | 200 |  196.497174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:16 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:16 | 200 |  1.049435549s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:20 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:20 | 200 |  4.464160932s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:49:00 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:00 | 200 |  1.985255122s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:49:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:11 | 200 |  200.712298ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:11 | 200 |  201.168477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  816.529433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  115.629762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  198.188649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:13 | 200 |  1.292137545s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:18 | 200 |  4.963518908s |       127.0.0.1 | POST     "/api/chat"
+Apr 18 16:07:49 launchpad ollama[1550]: [GIN] 2025/04/18 - 16:07:49 | 200 |     916.824µs |       127.0.0.1 | GET      "/api/tags"
+Apr 18 16:07:49 launchpad ollama[1550]: [GIN] 2025/04/18 - 16:07:49 | 200 |      26.637µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:49:21 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:21 | 200 |     616.613µs |       127.0.0.1 | GET      "/api/tags"
+Apr 19 11:49:21 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:21 | 200 |      29.302µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:49:24 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:24 | 200 |      24.758µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:53:35 launchpad ollama[1550]: time=2025-04-19T11:53:35.878-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.046-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.046-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 43649"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.048-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] build info | build=0 commit="unknown" tid="139631567757312" timestamp=1745088816
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139631567757312" timestamp=1745088816 total_threads=16
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43649" tid="139631567757312" timestamp=1745088816
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 19 11:53:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.299-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 19 11:53:44 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 19 11:53:44 launchpad ollama[286489]: INFO [main] model loaded | tid="139631567757312" timestamp=1745088824
+Apr 19 11:53:44 launchpad ollama[1550]: time=2025-04-19T11:53:44.572-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Apr 19 11:53:54 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:54 | 200 | 18.760340136s |       127.0.0.1 | POST     "/api/chat"
+Apr 19 11:53:54 launchpad ollama[1550]: time=2025-04-19T11:53:54.692-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:55 | 200 |  1.040315847s |       127.0.0.1 | POST     "/api/chat"
+Apr 19 11:53:55 launchpad ollama[1550]: time=2025-04-19T11:53:55.764-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:58 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:58 | 200 |  2.668759449s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.063-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.225-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.225-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 45227"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] build info | build=0 commit="unknown" tid="139967782432768" timestamp=1745164951
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139967782432768" timestamp=1745164951 total_threads=16
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45227" tid="139967782432768" timestamp=1745164951
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 09:02:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.532-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 09:02:32 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 09:02:32 launchpad ollama[360999]: INFO [main] model loaded | tid="139967782432768" timestamp=1745164952
+Apr 20 09:02:32 launchpad ollama[1550]: time=2025-04-20T09:02:32.535-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 20 09:02:51 launchpad ollama[1550]: [GIN] 2025/04/20 - 09:02:51 | 200 | 20.476372249s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:27:22 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:27:22 | 200 |      733.04µs |       127.0.0.1 | GET      "/api/tags"
+Apr 20 16:27:22 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:27:22 | 200 |      31.489µs |       127.0.0.1 | GET      "/api/version"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.513-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.673-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.673-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 32789"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.675-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] build info | build=0 commit="unknown" tid="140632346476544" timestamp=1745191790
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140632346476544" timestamp=1745191790 total_threads=16
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32789" tid="140632346476544" timestamp=1745191790
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:29:50 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.980-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:29:51 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:29:51 launchpad ollama[439938]: INFO [main] model loaded | tid="140632346476544" timestamp=1745191791
+Apr 20 16:29:51 launchpad ollama[1550]: time=2025-04-20T16:29:51.984-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 20 16:30:12 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:30:12 | 200 | 22.480292747s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.482-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.638-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.639-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 35699"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] build info | build=0 commit="unknown" tid="140354029445120" timestamp=1745192447
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140354029445120" timestamp=1745192447 total_threads=16
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35699" tid="140354029445120" timestamp=1745192447
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:40:47 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.939-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:40:48 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:40:48 launchpad ollama[449214]: INFO [main] model loaded | tid="140354029445120" timestamp=1745192448
+Apr 20 16:40:48 launchpad ollama[1550]: time=2025-04-20T16:40:48.942-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 20 16:41:03 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:41:03 | 200 |  16.09114742s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:42:50 launchpad ollama[1550]: time=2025-04-20T16:42:50.598-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:42:59 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:42:59 | 200 |  8.623545005s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.639-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.797-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.797-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 42229"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] build info | build=0 commit="unknown" tid="139952802324480" timestamp=1745193046
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139952802324480" timestamp=1745193046 total_threads=16
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42229" tid="139952802324480" timestamp=1745193046
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:50:46 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: time=2025-04-20T16:50:47.096-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:50:47 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:50:47 launchpad ollama[457113]: INFO [main] model loaded | tid="139952802324480" timestamp=1745193047
+Apr 20 16:50:48 launchpad ollama[1550]: time=2025-04-20T16:50:48.100-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 20 16:51:11 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:51:11 | 200 | 24.693625096s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:52:28 launchpad ollama[1550]: time=2025-04-20T16:52:28.274-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:52:46 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:52:46 | 200 | 18.600588561s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 10:25:40 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:25:40 | 200 |     650.481µs |       127.0.0.1 | GET      "/api/tags"
+Apr 21 10:25:40 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:25:40 | 200 |      38.786µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 10:59:13 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:59:13 | 200 |     702.398µs |       127.0.0.1 | GET      "/api/tags"
+Apr 21 10:59:13 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:59:13 | 200 |      31.923µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 12:20:05 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:05 | 200 |       28.22µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9231532032 required="6.2 GiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.881-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46503"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] build info | build=0 commit="unknown" tid="139654903910400" timestamp=1745263227
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139654903910400" timestamp=1745263227 total_threads=16
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46503" tid="139654903910400" timestamp=1745263227
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 21 12:20:28 launchpad ollama[1550]: time=2025-04-21T12:20:28.132-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 21 12:20:28 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 21 12:20:28 launchpad ollama[520568]: INFO [main] model loaded | tid="139654903910400" timestamp=1745263228
+Apr 21 12:20:29 launchpad ollama[1550]: time=2025-04-21T12:20:29.136-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 21 12:20:29 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:29 | 200 |  1.926670109s |       127.0.0.1 | POST     "/api/embed"
+Apr 21 12:20:30 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:30 | 200 |  484.286454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 21 12:20:30 launchpad ollama[1550]: time=2025-04-21T12:20:30.314-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9232056320 required="6.5 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.015-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38327"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] build info | build=0 commit="unknown" tid="140185823301632" timestamp=1745263231
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140185823301632" timestamp=1745263231 total_threads=16
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38327" tid="140185823301632" timestamp=1745263231
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   66 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type q4_K:  193 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:   33 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.267-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 21 12:20:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 21 12:20:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 21 12:20:36 launchpad ollama[520598]: INFO [main] model loaded | tid="140185823301632" timestamp=1745263236
+Apr 21 12:20:36 launchpad ollama[1550]: time=2025-04-21T12:20:36.532-07:00 level=INFO source=server.go:626 msg="llama runner started in 5.52 seconds"
+Apr 21 12:20:51 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:51 | 200 | 21.384301145s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:20:53 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:53 | 200 |  1.694659971s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:20:55 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:55 | 200 |    1.7042573s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:22:19 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:22:19 | 200 |      26.357µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:53:45 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:45 | 200 |     624.913µs |       127.0.0.1 | GET      "/api/tags"
+Apr 22 16:53:45 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:45 | 200 |      36.386µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:53:53 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:53 | 200 |      25.526µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8910471168 required="6.2 GiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39735"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.805-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] build info | build=0 commit="unknown" tid="140233234300928" timestamp=1745366096
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140233234300928" timestamp=1745366096 total_threads=16
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39735" tid="140233234300928" timestamp=1745366096
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 16:54:57 launchpad ollama[1550]: time=2025-04-22T16:54:57.056-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 16:54:57 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 16:54:57 launchpad ollama[662024]: INFO [main] model loaded | tid="140233234300928" timestamp=1745366097
+Apr 22 16:54:58 launchpad ollama[1550]: time=2025-04-22T16:54:58.059-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 16:54:59 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:54:59 | 200 |  3.133858652s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 16:55:00 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:55:00 | 200 |   309.42198ms |       127.0.0.1 | POST     "/api/chat"
+Apr 22 16:55:01 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:55:01 | 200 |  1.346155592s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045213184 required="6.2 GiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35675"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.229-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] build info | build=0 commit="unknown" tid="140574638034944" timestamp=1745366664
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140574638034944" timestamp=1745366664 total_threads=16
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35675" tid="140574638034944" timestamp=1745366664
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.480-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:04:24 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:04:25 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:04:25 launchpad ollama[663467]: INFO [main] model loaded | tid="140574638034944" timestamp=1745366665
+Apr 22 17:04:25 launchpad ollama[1550]: time=2025-04-22T17:04:25.483-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 17:04:35 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:04:35 | 200 | 11.442816882s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:07:21 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:07:21 | 200 |  6.018450452s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:09:40 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:09:40 | 200 |   8.79640683s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9008644096 required="6.2 GiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38425"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] build info | build=0 commit="unknown" tid="140393734627328" timestamp=1745367387
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140393734627328" timestamp=1745367387 total_threads=16
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38425" tid="140393734627328" timestamp=1745367387
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.557-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:16:27 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:16:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:16:28 launchpad ollama[665429]: INFO [main] model loaded | tid="140393734627328" timestamp=1745367388
+Apr 22 17:16:28 launchpad ollama[1550]: time=2025-04-22T17:16:28.560-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 17:16:36 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:16:36 | 200 |  9.136432762s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8955559936 required="6.2 GiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44483"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.596-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] build info | build=0 commit="unknown" tid="139861360054272" timestamp=1745367757
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139861360054272" timestamp=1745367757 total_threads=16
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44483" tid="139861360054272" timestamp=1745367757
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.846-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:22:37 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:22:38 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:22:38 launchpad ollama[666377]: INFO [main] model loaded | tid="139861360054272" timestamp=1745367758
+Apr 22 17:22:38 launchpad ollama[1550]: time=2025-04-22T17:22:38.851-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 17:22:47 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:22:47 | 200 |  9.620504572s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:27:19 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:27:19 | 200 |  6.365838487s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:32:07 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:32:07 | 200 |  8.012798901s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:34:22 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:34:22 | 200 |  9.885575052s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:37:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:37:27 | 200 | 11.185556841s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8993046528 required="6.2 GiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41491"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.492-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] build info | build=0 commit="unknown" tid="140105416462336" timestamp=1745369020
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140105416462336" timestamp=1745369020 total_threads=16
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41491" tid="140105416462336" timestamp=1745369020
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.742-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:43:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:43:41 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:43:41 launchpad ollama[669773]: INFO [main] model loaded | tid="140105416462336" timestamp=1745369021
+Apr 22 17:43:41 launchpad ollama[1550]: time=2025-04-22T17:43:41.746-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 17:43:53 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:43:53 | 200 | 13.312830298s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.642-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9031450624 required="6.2 GiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.642-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.643-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41395"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] build info | build=0 commit="unknown" tid="140079229906944" timestamp=1745370360
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140079229906944" timestamp=1745370360 total_threads=16
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41395" tid="140079229906944" timestamp=1745370360
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.895-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:06:00 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:06:01 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:06:01 launchpad ollama[673206]: INFO [main] model loaded | tid="140079229906944" timestamp=1745370361
+Apr 22 18:06:01 launchpad ollama[1550]: time=2025-04-22T18:06:01.898-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:06:13 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:06:13 | 200 | 13.014326109s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8910274560 required="6.2 GiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.603-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38957"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.604-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.604-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] build info | build=0 commit="unknown" tid="140297879146496" timestamp=1745371043
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140297879146496" timestamp=1745371043 total_threads=16
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38957" tid="140297879146496" timestamp=1745371043
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.855-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:17:23 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:17:24 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:17:24 launchpad ollama[674998]: INFO [main] model loaded | tid="140297879146496" timestamp=1745371044
+Apr 22 18:17:24 launchpad ollama[1550]: time=2025-04-22T18:17:24.858-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:17:37 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:17:37 | 200 | 14.234576356s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985640960 required="6.2 GiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38355"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] build info | build=0 commit="unknown" tid="140178250317824" timestamp=1745371520
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140178250317824" timestamp=1745371520 total_threads=16
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38355" tid="140178250317824" timestamp=1745371520
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.781-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:25:20 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:25:21 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:25:21 launchpad ollama[676233]: INFO [main] model loaded | tid="140178250317824" timestamp=1745371521
+Apr 22 18:25:21 launchpad ollama[1550]: time=2025-04-22T18:25:21.785-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 18:25:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:25:27 | 200 |  7.044238455s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:27:46 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:27:46 | 200 |   5.82328305s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8987344896 required="6.2 GiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42383"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.995-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] build info | build=0 commit="unknown" tid="140251518558208" timestamp=1745372002
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140251518558208" timestamp=1745372002 total_threads=16
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42383" tid="140251518558208" timestamp=1745372002
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: time=2025-04-22T18:33:22.246-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:33:22 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:33:23 launchpad ollama[677442]: INFO [main] model loaded | tid="140251518558208" timestamp=1745372003
+Apr 22 18:33:23 launchpad ollama[1550]: time=2025-04-22T18:33:23.249-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:33:29 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:33:29 | 200 |  7.950536678s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:34:59 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:34:59 | 200 |  3.708373562s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:36:15 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:36:15 | 200 |  4.578915411s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:37:25 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:37:25 | 200 |  3.434990383s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:39:24 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:39:24 | 200 |  4.592553143s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.264-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9071296512 required="6.2 GiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.264-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.265-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.265-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45871"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] build info | build=0 commit="unknown" tid="139642144657408" timestamp=1745373341
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139642144657408" timestamp=1745373341 total_threads=16
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45871" tid="139642144657408" timestamp=1745373341
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.517-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:55:41 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:55:42 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:55:42 launchpad ollama[680809]: INFO [main] model loaded | tid="139642144657408" timestamp=1745373342
+Apr 22 18:55:42 launchpad ollama[1550]: time=2025-04-22T18:55:42.521-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:55:49 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:55:49 | 200 |   8.29654315s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:58:29 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:58:29 | 200 |  6.561886447s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 19:00:50 launchpad ollama[1550]: [GIN] 2025/04/22 - 19:00:50 | 200 |  7.304756304s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 19:04:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 19:04:27 | 200 |  6.741603191s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:20:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:54 | 200 |     640.676µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 11:20:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:55 | 200 |      24.144µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 11:20:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:58 | 200 |      36.017µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.497-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9196929024 required="6.2 GiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.497-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.498-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43495"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] build info | build=0 commit="unknown" tid="140569704751104" timestamp=1745432606
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140569704751104" timestamp=1745432606 total_threads=16
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43495" tid="140569704751104" timestamp=1745432606
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.750-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:23:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:23:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:23:27 launchpad ollama[704084]: INFO [main] model loaded | tid="140569704751104" timestamp=1745432607
+Apr 23 11:23:27 launchpad ollama[1550]: time=2025-04-23T11:23:27.754-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 11:23:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:33 | 200 |  7.387660865s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:23:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:34 | 200 |  1.187274349s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:23:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:37 | 200 |  2.783100881s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9211609088 required="6.2 GiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.461-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38373"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] build info | build=0 commit="unknown" tid="140119599300608" timestamp=1745432952
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140119599300608" timestamp=1745432952 total_threads=16
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38373" tid="140119599300608" timestamp=1745432952
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.712-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:29:12 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:29:13 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:29:13 launchpad ollama[704979]: INFO [main] model loaded | tid="140119599300608" timestamp=1745432953
+Apr 23 11:29:13 launchpad ollama[1550]: time=2025-04-23T11:29:13.716-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:29:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:29:19 | 200 |  7.602748082s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:31:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:31:01 | 200 |  6.725284712s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34019"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] build info | build=0 commit="unknown" tid="140606098849792" timestamp=1745433515
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140606098849792" timestamp=1745433515 total_threads=16
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34019" tid="140606098849792" timestamp=1745433515
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:38:36 launchpad ollama[1550]: time=2025-04-23T11:38:36.041-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:38:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:38:36 launchpad ollama[706422]: INFO [main] model loaded | tid="140606098849792" timestamp=1745433516
+Apr 23 11:38:37 launchpad ollama[1550]: time=2025-04-23T11:38:37.045-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:38:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:38:45 | 200 | 10.146176166s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36333"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] build info | build=0 commit="unknown" tid="140246380457984" timestamp=1745434226
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140246380457984" timestamp=1745434226 total_threads=16
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36333" tid="140246380457984" timestamp=1745434226
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.729-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:50:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:50:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:50:27 launchpad ollama[708177]: INFO [main] model loaded | tid="140246380457984" timestamp=1745434227
+Apr 23 11:50:27 launchpad ollama[1550]: time=2025-04-23T11:50:27.732-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:50:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:50:36 | 200 |  9.728679535s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34477"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] build info | build=0 commit="unknown" tid="140169853362176" timestamp=1745434702
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140169853362176" timestamp=1745434702 total_threads=16
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34477" tid="140169853362176" timestamp=1745434702
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.642-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:58:22 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:58:23 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:58:23 launchpad ollama[709417]: INFO [main] model loaded | tid="140169853362176" timestamp=1745434703
+Apr 23 11:58:23 launchpad ollama[1550]: time=2025-04-23T11:58:23.645-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:58:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:58:33 | 200 | 11.194826934s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:01:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:01:24 | 200 |   7.31735944s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9295101952 required="6.2 GiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44523"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] build info | build=0 commit="unknown" tid="140476077305856" timestamp=1745435511
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476077305856" timestamp=1745435511 total_threads=16
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44523" tid="140476077305856" timestamp=1745435511
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.389-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:11:51 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:11:52 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:11:52 launchpad ollama[711454]: INFO [main] model loaded | tid="140476077305856" timestamp=1745435512
+Apr 23 12:11:52 launchpad ollama[1550]: time=2025-04-23T12:11:52.393-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:11:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:11:59 | 200 |  8.701291304s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9182248960 required="6.2 GiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38925"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] build info | build=0 commit="unknown" tid="139797635899392" timestamp=1745436678
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139797635899392" timestamp=1745436678 total_threads=16
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38925" tid="139797635899392" timestamp=1745436678
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.425-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:31:18 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:31:19 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:31:19 launchpad ollama[714582]: INFO [main] model loaded | tid="139797635899392" timestamp=1745436679
+Apr 23 12:31:19 launchpad ollama[1550]: time=2025-04-23T12:31:19.429-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:31:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:31:26 | 200 |  8.914395683s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9199026176 required="6.2 GiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41947"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.733-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] build info | build=0 commit="unknown" tid="139621129306112" timestamp=1745437230
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139621129306112" timestamp=1745437230 total_threads=16
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41947" tid="139621129306112" timestamp=1745437230
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.983-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:40:30 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:40:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:40:31 launchpad ollama[715978]: INFO [main] model loaded | tid="139621129306112" timestamp=1745437231
+Apr 23 12:40:31 launchpad ollama[1550]: time=2025-04-23T12:40:31.986-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:40:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:40:39 | 200 |  8.635022503s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:42:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:42:18 | 200 |  9.352342256s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:44:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:44:21 | 200 |  9.578402887s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:46:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:46:15 | 200 |  8.397961733s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:50:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:50:03 | 200 | 11.400203648s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37239"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] build info | build=0 commit="unknown" tid="140095530123264" timestamp=1745438199
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140095530123264" timestamp=1745438199 total_threads=16
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37239" tid="140095530123264" timestamp=1745438199
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.351-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:56:39 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:56:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:56:40 launchpad ollama[718453]: INFO [main] model loaded | tid="140095530123264" timestamp=1745438200
+Apr 23 12:56:40 launchpad ollama[1550]: time=2025-04-23T12:56:40.355-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 12:56:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:56:50 | 200 | 11.124863688s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.689-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34377"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.689-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.690-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.690-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] build info | build=0 commit="unknown" tid="139719604162560" timestamp=1745438649
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139719604162560" timestamp=1745438649 total_threads=16
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34377" tid="139719604162560" timestamp=1745438649
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.940-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:04:09 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:04:10 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:04:10 launchpad ollama[719592]: INFO [main] model loaded | tid="139719604162560" timestamp=1745438650
+Apr 23 13:04:10 launchpad ollama[1550]: time=2025-04-23T13:04:10.944-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:04:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:04:21 | 200 | 11.869875333s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.726-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.727-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.727-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36641"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] build info | build=0 commit="unknown" tid="139783658434560" timestamp=1745438965
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139783658434560" timestamp=1745438965 total_threads=16
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36641" tid="139783658434560" timestamp=1745438965
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.979-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:09:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:09:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:09:26 launchpad ollama[720418]: INFO [main] model loaded | tid="139783658434560" timestamp=1745438966
+Apr 23 13:09:26 launchpad ollama[1550]: time=2025-04-23T13:09:26.983-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:09:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:09:39 | 200 | 14.102146572s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297199104 required="6.2 GiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.602-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40583"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] build info | build=0 commit="unknown" tid="139956267622400" timestamp=1745439700
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956267622400" timestamp=1745439700 total_threads=16
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40583" tid="139956267622400" timestamp=1745439700
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.854-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:21:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:21:41 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:21:41 launchpad ollama[722240]: INFO [main] model loaded | tid="139956267622400" timestamp=1745439701
+Apr 23 13:21:41 launchpad ollama[1550]: time=2025-04-23T13:21:41.857-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:21:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:21:53 | 200 | 12.953755348s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:25:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:25:52 | 200 |  8.075097905s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40585"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] build info | build=0 commit="unknown" tid="140497420255232" timestamp=1745440513
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140497420255232" timestamp=1745440513 total_threads=16
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40585" tid="140497420255232" timestamp=1745440513
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.578-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:35:13 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:35:14 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:35:14 launchpad ollama[724277]: INFO [main] model loaded | tid="140497420255232" timestamp=1745440514
+Apr 23 13:35:14 launchpad ollama[1550]: time=2025-04-23T13:35:14.582-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:35:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:35:24 | 200 | 11.142479482s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:37:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:37:51 | 200 | 10.230882447s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:41:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:41:43 | 200 |  9.864597151s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.913-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9190637568 required="6.2 GiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.913-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.914-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44127"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] build info | build=0 commit="unknown" tid="139936180211712" timestamp=1745441613
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139936180211712" timestamp=1745441613 total_threads=16
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44127" tid="139936180211712" timestamp=1745441613
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:53:34 launchpad ollama[1550]: time=2025-04-23T13:53:34.166-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:53:34 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:53:34 launchpad ollama[727133]: INFO [main] model loaded | tid="139936180211712" timestamp=1745441614
+Apr 23 13:53:35 launchpad ollama[1550]: time=2025-04-23T13:53:35.170-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:53:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:53:44 | 200 | 11.276115175s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42567"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.718-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] build info | build=0 commit="unknown" tid="139757118943232" timestamp=1745442445
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139757118943232" timestamp=1745442445 total_threads=16
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42567" tid="139757118943232" timestamp=1745442445
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.968-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:07:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:07:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:07:26 launchpad ollama[729184]: INFO [main] model loaded | tid="139757118943232" timestamp=1745442446
+Apr 23 14:07:26 launchpad ollama[1550]: time=2025-04-23T14:07:26.972-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:07:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:07:35 | 200 | 10.357410818s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:10:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:10:51 | 200 | 13.678362324s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:15:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:15:25 | 200 | 11.600829043s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45449"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] build info | build=0 commit="unknown" tid="140011585576960" timestamp=1745443596
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140011585576960" timestamp=1745443596 total_threads=16
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45449" tid="140011585576960" timestamp=1745443596
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:26:37 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:26:37 launchpad ollama[732078]: INFO [main] model loaded | tid="140011585576960" timestamp=1745443597
+Apr 23 14:26:37 launchpad ollama[1550]: time=2025-04-23T14:26:37.994-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:26:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:26:48 | 200 | 12.380089776s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.435-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.435-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.436-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.436-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44737"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] build info | build=0 commit="unknown" tid="140588807684096" timestamp=1745443981
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140588807684096" timestamp=1745443981 total_threads=16
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44737" tid="140588807684096" timestamp=1745443981
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.688-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:33:01 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:33:02 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:33:02 launchpad ollama[733036]: INFO [main] model loaded | tid="140588807684096" timestamp=1745443982
+Apr 23 14:33:02 launchpad ollama[1550]: time=2025-04-23T14:33:02.692-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 14:33:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:33:11 | 200 | 10.308262643s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200533504 required="6.2 GiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46227"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.898-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] build info | build=0 commit="unknown" tid="140104576258048" timestamp=1745444319
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140104576258048" timestamp=1745444319 total_threads=16
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46227" tid="140104576258048" timestamp=1745444319
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:38:40 launchpad ollama[1550]: time=2025-04-23T14:38:40.149-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:38:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:38:40 launchpad ollama[734094]: INFO [main] model loaded | tid="140104576258048" timestamp=1745444320
+Apr 23 14:38:41 launchpad ollama[1550]: time=2025-04-23T14:38:41.152-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:38:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:45 | 200 |  5.917909773s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:50 | 200 |  4.403901904s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:54 | 200 |  4.295810322s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:54 | 200 |  463.056432ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:16 | 200 |  1.867684278s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:41:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:17 | 200 |  217.566499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:17 | 200 |  218.140938ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:19 | 200 |  1.080943491s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:21 | 200 |  2.308630604s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:42:37 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3631 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444557
+Apr 23 14:42:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:39 | 200 |  1.929403943s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:42:39 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444559
+Apr 23 14:42:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:40 | 200 |  1.099211505s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:40 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444560
+Apr 23 14:42:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:41 | 200 |  1.100569352s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:43 | 200 |  1.433242062s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:43 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1075 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444563
+Apr 23 14:42:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:44 | 200 |  1.481279163s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:43:24 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1163 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444604
+Apr 23 14:43:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:25 | 200 |  1.646370369s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:43:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:25 | 200 |  177.227665ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:26 | 200 |  176.892154ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:27 | 200 |  1.364166657s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:29 | 200 |  1.899849366s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:10 | 200 |  789.526405ms |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:11 | 200 |   172.84459ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:11 | 200 |  175.194876ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:12 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:12 | 200 |  1.259187511s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:14 | 200 |  1.532774908s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:46 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444686
+Apr 23 14:44:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:46 | 200 |  1.041423442s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:47 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444687
+Apr 23 14:44:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:47 | 200 |  1.003305675s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:49 | 200 |  1.321671086s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:49 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1075 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444689
+Apr 23 14:44:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:51 | 200 |  2.183832595s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:46:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:44 | 200 |  232.136796ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:44 | 200 |  212.298351ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:45 | 200 |  880.720113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:47 | 200 |  2.426639671s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:15 | 200 |  253.139884ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:15 | 200 |  224.385177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:16 | 200 |  1.057310693s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:21 | 200 |  4.634745846s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:54:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:25 | 200 |  205.010376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:26 | 200 |  191.526085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:27 | 200 |  1.061523535s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:33 | 200 |  5.795074344s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297133568 required="6.2 GiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36557"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] build info | build=0 commit="unknown" tid="140478615900160" timestamp=1745446116
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140478615900160" timestamp=1745446116 total_threads=16
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36557" tid="140478615900160" timestamp=1745446116
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.436-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:08:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:08:37 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:08:37 launchpad ollama[738663]: INFO [main] model loaded | tid="140478615900160" timestamp=1745446117
+Apr 23 15:08:37 launchpad ollama[1550]: time=2025-04-23T15:08:37.439-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 15:08:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:37 | 200 |  1.752160476s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:38 | 200 |  316.550556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:38 | 200 |  913.865381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:40 | 200 |  1.524003863s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:52 | 200 |  162.847618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:52 | 200 |  162.958158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:53 | 200 |  1.119670421s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:54 | 200 |  1.321134401s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:09:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:22 | 200 |  209.634616ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:23 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:23 | 200 |  186.861717ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:24 | 200 |  1.087752103s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:30 | 200 |  6.284829003s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39331"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] build info | build=0 commit="unknown" tid="139751249104896" timestamp=1745447345
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139751249104896" timestamp=1745447345 total_threads=16
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39331" tid="139751249104896" timestamp=1745447345
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.844-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:29:05 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:29:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:29:06 launchpad ollama[741770]: INFO [main] model loaded | tid="139751249104896" timestamp=1745447346
+Apr 23 15:29:06 launchpad ollama[1550]: time=2025-04-23T15:29:06.848-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 15:29:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:07 | 200 |  1.783493384s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:07 | 200 |  352.608519ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:08 | 200 |  935.295941ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:16 | 200 |  7.495554825s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:31:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:35 | 200 |  206.886892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:35 | 200 |  187.438793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:36 | 200 |  1.066432478s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:43 | 200 |   7.29251208s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:34:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:05 | 200 |  225.521346ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:06 | 200 |    211.0022ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:07 | 200 |  1.134593019s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:15 | 200 |  8.252084922s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299755008 required="6.2 GiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40299"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] build info | build=0 commit="unknown" tid="140164261195776" timestamp=1745448205
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140164261195776" timestamp=1745448205 total_threads=16
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40299" tid="140164261195776" timestamp=1745448205
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.543-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:43:25 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:43:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:43:26 launchpad ollama[743911]: INFO [main] model loaded | tid="140164261195776" timestamp=1745448206
+Apr 23 15:43:26 launchpad ollama[1550]: time=2025-04-23T15:43:26.546-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 15:43:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:26 | 200 |   1.66424257s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:27 | 200 |  229.121682ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:27 | 200 |  922.185553ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:34 | 200 |  6.808054244s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.247-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.248-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.248-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36393"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] build info | build=0 commit="unknown" tid="139678188761088" timestamp=1745448538
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139678188761088" timestamp=1745448538 total_threads=16
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36393" tid="139678188761088" timestamp=1745448538
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.501-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:48:58 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:48:59 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:48:59 launchpad ollama[744744]: INFO [main] model loaded | tid="139678188761088" timestamp=1745448539
+Apr 23 15:48:59 launchpad ollama[1550]: time=2025-04-23T15:48:59.504-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 15:48:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:48:59 | 200 |  1.641345052s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:48:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:48:59 | 200 |  200.495439ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:49:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:49:00 | 200 |  918.072418ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:49:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:49:03 | 200 |    2.7918668s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:50:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:20 | 200 |  197.142027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:20 | 200 |  184.553055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:22 | 200 |  1.218516621s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:22 launchpad ollama[744744]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139678188761088" timestamp=1745448622
+Apr 23 15:50:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:24 | 200 |  2.043965397s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:51:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:33 | 200 |   204.39221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:33 | 200 |  183.047103ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:34 | 200 |  1.156643663s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:40 | 200 |  5.534130701s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:53:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:45 | 200 |  207.214064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:45 | 200 |   194.09473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:46 | 200 |  1.079799274s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:49 | 200 |  2.536857476s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:54:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:40 | 200 |  208.041304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:40 | 200 |  194.694403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:41 | 200 |  1.229778061s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:46 | 200 |  5.179118839s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.4 GiB" free_swap="68.9 GiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41021"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] build info | build=0 commit="unknown" tid="139822820651008" timestamp=1745449311
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139822820651008" timestamp=1745449311 total_threads=16
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41021" tid="139822820651008" timestamp=1745449311
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: time=2025-04-23T16:01:52.174-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:01:52 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:01:52 launchpad ollama[746687]: INFO [main] model loaded | tid="139822820651008" timestamp=1745449312
+Apr 23 16:01:53 launchpad ollama[1550]: time=2025-04-23T16:01:53.178-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:01:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:53 | 200 |  1.706338953s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:53 | 200 |  261.222902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:54 | 200 |   904.29523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:57 | 200 |  3.144308181s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:02:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:07 | 200 |  169.518046ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:07 | 200 |  170.777694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:08 | 200 |  1.163610601s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:11 | 200 |  3.037664362s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:03:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:56 | 200 |   233.25358ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:57 | 200 |  211.941961ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:58 | 200 |  1.150480481s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:59 | 200 |  1.591883084s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:07:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:01 | 200 |  649.941245ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:01 | 200 |  620.256672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:02 | 200 |  1.254070605s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:07 | 200 |  4.512974526s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:08:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:11 | 200 |  217.346016ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:11 | 200 |  190.966444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:12 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:12 | 200 |  1.203683956s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:20 | 200 |  8.272710161s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:52 | 200 |  212.727769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:52 | 200 |  191.783099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:53 | 200 |  1.240007945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:57 | 200 |  4.110226989s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:10:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:33 | 200 |  205.415213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:33 | 200 |  195.861841ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:35 | 200 |  1.258611531s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:38 | 200 |  3.649750446s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:11:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:43 | 200 |  198.033315ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:43 | 200 |  184.186461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:44 | 200 |   1.19371359s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:53 | 200 |   8.97736957s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:14:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:04 | 200 |  262.971922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:05 | 200 |  229.765107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:06 | 200 |  1.269428039s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:11 | 200 |   5.09113946s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:15:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:42 | 200 |  204.459233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:42 | 200 |   184.23007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:43 | 200 |  892.046879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:51 | 200 |  7.945472274s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:19:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:02 | 200 |  225.963756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:02 | 200 |  214.736421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:03 | 200 |  973.320591ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:08 | 200 |  4.787175055s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:48 | 200 |  193.996215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:48 | 200 |   181.40443ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:49 | 200 |  1.249854336s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:52 | 200 |  2.889936862s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:25:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:50 | 200 |  207.587684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:50 | 200 |  190.318979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:51 | 200 |  1.116785127s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:54 | 200 |  3.055486338s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9206300672 required="6.2 GiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37471"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] build info | build=0 commit="unknown" tid="140589304725504" timestamp=1745451425
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140589304725504" timestamp=1745451425 total_threads=16
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37471" tid="140589304725504" timestamp=1745451425
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.971-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:37:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:37:06 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:37:06 launchpad ollama[752373]: INFO [main] model loaded | tid="140589304725504" timestamp=1745451426
+Apr 23 16:37:06 launchpad ollama[1550]: time=2025-04-23T16:37:06.975-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:37:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:07 | 200 |  1.715130566s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:07 | 200 |  277.059508ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:08 | 200 |  993.749274ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:15 | 200 |  7.161703414s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:38:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:51 | 200 |  214.272645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:51 | 200 |  194.126252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:52 | 200 |   1.22954814s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:57 | 200 |  5.296634418s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:43:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:38 | 200 |  270.675166ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:38 | 200 |  256.470661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:39 | 200 |  1.217220835s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:46 | 200 |  6.269839591s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:45:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:21 | 200 |  200.567383ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:21 | 200 |  181.252238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:22 | 200 |  1.158915419s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:26 | 200 |   4.41121607s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300279296 required="6.2 GiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36365"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] build info | build=0 commit="unknown" tid="139849870778368" timestamp=1745452496
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139849870778368" timestamp=1745452496 total_threads=16
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36365" tid="139849870778368" timestamp=1745452496
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.464-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:54:56 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:54:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:54:57 launchpad ollama[755027]: INFO [main] model loaded | tid="139849870778368" timestamp=1745452497
+Apr 23 16:54:57 launchpad ollama[1550]: time=2025-04-23T16:54:57.468-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:54:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:57 | 200 |  1.733030575s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:54:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:58 | 200 |  291.119439ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:54:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:59 | 200 |  969.844968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:55:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:55:05 | 200 |  6.301849199s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300803584 required="6.2 GiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43175"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] build info | build=0 commit="unknown" tid="140225427890176" timestamp=1745453294
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140225427890176" timestamp=1745453294 total_threads=16
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43175" tid="140225427890176" timestamp=1745453294
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.347-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:08:14 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:08:15 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:08:15 launchpad ollama[757052]: INFO [main] model loaded | tid="140225427890176" timestamp=1745453295
+Apr 23 17:08:15 launchpad ollama[1550]: time=2025-04-23T17:08:15.350-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:08:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:15 | 200 |  1.655003829s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:15 | 200 |  218.820867ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:16 | 200 |  945.013966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:22 | 200 |  5.842121313s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.791-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300803584 required="6.2 GiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.791-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.792-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43739"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.794-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] build info | build=0 commit="unknown" tid="139965543305216" timestamp=1745453754
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139965543305216" timestamp=1745453754 total_threads=16
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43739" tid="139965543305216" timestamp=1745453754
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:15:55 launchpad ollama[1550]: time=2025-04-23T17:15:55.045-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:15:55 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:15:55 launchpad ollama[758218]: INFO [main] model loaded | tid="139965543305216" timestamp=1745453755
+Apr 23 17:15:56 launchpad ollama[1550]: time=2025-04-23T17:15:56.049-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 17:15:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:56 | 200 |  1.665246282s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:15:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:56 | 200 |  228.248626ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:15:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:57 | 200 |  908.149474ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:16:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:16:03 | 200 |  6.195206239s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300279296 required="6.2 GiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32867"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] build info | build=0 commit="unknown" tid="139971416432640" timestamp=1745454275
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139971416432640" timestamp=1745454275 total_threads=16
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32867" tid="139971416432640" timestamp=1745454275
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.969-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:24:35 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:24:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:24:36 launchpad ollama[759652]: INFO [main] model loaded | tid="139971416432640" timestamp=1745454276
+Apr 23 17:24:36 launchpad ollama[1550]: time=2025-04-23T17:24:36.973-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:24:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:37 | 200 |  1.715784962s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:37 | 200 |  272.002243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:38 | 200 |  891.992735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:44 | 200 |   6.42036935s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9296609280 required="6.2 GiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43877"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] build info | build=0 commit="unknown" tid="139761445388288" timestamp=1745454594
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139761445388288" timestamp=1745454594 total_threads=16
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43877" tid="139761445388288" timestamp=1745454594
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.809-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:29:54 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:29:55 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:29:55 launchpad ollama[760480]: INFO [main] model loaded | tid="139761445388288" timestamp=1745454595
+Apr 23 17:29:55 launchpad ollama[1550]: time=2025-04-23T17:29:55.812-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:29:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:56 | 200 |  1.647709406s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:29:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:56 | 200 |  218.040739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:29:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:57 | 200 |  931.171771ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:30:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:30:03 | 200 |  6.553792836s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:33:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:41 | 200 |  5.513200945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:47 | 200 |  5.951761173s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:52 | 200 |  5.324029464s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:57 | 200 |  4.988908165s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:03 | 200 |  5.481583284s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:08 | 200 |  5.636927445s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:13 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:13 | 200 |  4.973832389s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:20 | 200 |  6.429673412s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:26 | 200 |  6.170376806s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:31 | 200 |  5.472332692s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:37 | 200 |  5.745230658s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:43 | 200 |  5.290740391s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:48 | 200 |  5.373433295s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:53 | 200 |  5.298728666s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:59 | 200 |  5.631392826s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:04 | 200 |  5.330692538s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:09 | 200 |  5.099768709s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:15 | 200 |  5.463271492s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:21 | 200 |  5.622586806s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:26 | 200 |  5.416842498s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:31 | 200 |   5.38208324s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:37 | 200 |  5.115517484s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:42 | 200 |  5.388257945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:48 | 200 |  5.700935744s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:53 | 200 |  4.827407192s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:59 | 200 |  5.795170652s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:04 | 200 |  5.613308453s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:10 | 200 |  5.792461306s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:15 | 200 |  5.441378647s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:21 | 200 |  5.054836714s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:26 | 200 |  5.354167418s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:31 | 200 |  5.127645916s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:35 | 200 |  3.498165595s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:55 | 200 |  1.124623728s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:36:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:55 | 200 |   190.57658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:56 | 200 |  191.353147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  1.177496813s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  109.674102ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  193.270449ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:58 | 200 |  925.295133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:37:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:37:04 | 200 |  5.495618968s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:41:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:32 | 200 |   1.35067355s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:41:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:33 | 200 |  236.562616ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:33 | 200 |  235.158709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |  1.151250864s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |    113.8258ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |  195.821102ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:36 | 200 |  1.156124603s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:43 | 200 |  7.450401236s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:46:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:46:56 | 200 |      25.745µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:48:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:48:14 | 200 |      24.329µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:48:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:48:17 | 200 |      26.686µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9115729920 required="6.2 GiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.2 GiB" free_swap="68.9 GiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43095"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] build info | build=0 commit="unknown" tid="139627022245888" timestamp=1745455753
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139627022245888" timestamp=1745455753 total_threads=16
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43095" tid="139627022245888" timestamp=1745455753
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:49:13 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:49:14 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:49:14 launchpad ollama[763723]: INFO [main] model loaded | tid="139627022245888" timestamp=1745455754
+Apr 23 17:49:14 launchpad ollama[1550]: time=2025-04-23T17:49:14.446-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 17:49:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:14 | 200 |  1.621844807s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:14 | 200 |  178.194889ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:15 | 200 |  986.923716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:15 | 200 |   95.944006ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:16 | 200 |  177.628998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:17 | 200 |  902.883751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:21 | 200 |  4.746923904s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:50:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:50:06 | 200 |     624.041µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 17:50:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:50:10 | 200 |       29.31µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:51:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:09 | 200 |  201.501533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:09 | 200 |  188.590244ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:10 | 200 |  1.141851794s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:14 | 200 |  4.147923929s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:15 | 200 |  484.995762ms |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:51:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:17 | 200 |  2.396052006s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:52:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:31 | 200 |  194.429789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:31 | 200 |  174.473469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:32 | 200 |  1.072062433s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:35 | 200 |  2.747763494s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:53:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:53:00 | 200 |       33.53µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:54:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:28 | 200 |  205.822094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:28 | 200 |  191.084425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  1.216033327s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  107.195288ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  185.226544ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:30 | 200 |  957.627968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:36 | 200 |  5.847705896s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:55:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:55:59 | 200 |  199.620826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:55:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:55:59 | 200 |  187.281075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:01 | 200 |  1.391120446s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:04 | 200 |  3.404000606s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:57 | 200 |  196.926397ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:57 | 200 |  182.929148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:58 | 200 |  1.340671128s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:57:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:57:00 | 200 |  1.768564865s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:00:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:00:22 | 200 |      84.783µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:03:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:03:00 | 200 |      41.155µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:04:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:04:50 | 200 |     590.418µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 18:05:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:06 | 200 |     590.438µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 18:05:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:32 | 200 |      24.818µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:05:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:42 | 200 |       24.93µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9156427776 required="6.2 GiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44079"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] build info | build=0 commit="unknown" tid="140340407275520" timestamp=1745456866
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140340407275520" timestamp=1745456866 total_threads=16
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44079" tid="140340407275520" timestamp=1745456866
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.329-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 18:07:46 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 18:07:47 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 18:07:47 launchpad ollama[766850]: INFO [main] model loaded | tid="140340407275520" timestamp=1745456867
+Apr 23 18:07:47 launchpad ollama[1550]: time=2025-04-23T18:07:47.332-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 18:07:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:48 | 200 |  2.548020507s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:49 | 200 |  1.111278657s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:50 | 200 |  1.148013528s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:51 | 200 |  1.095264503s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:53 | 200 |  1.150553696s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:54 | 200 |  1.095690186s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:59 | 200 |  4.989010415s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:08:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:37 | 200 |  201.013496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:38 | 200 |  186.500957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  1.124483688s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  103.263269ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  182.676138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:40 | 200 |  1.222491182s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:46 | 200 |  5.496173932s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:11:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:16 | 200 |  241.451316ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:17 | 200 |  224.899449ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  1.166300681s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  143.182785ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  225.587214ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:19 | 200 |  1.280338269s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:27 | 200 |  7.219407188s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9196863488 required="6.2 GiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.7 GiB" free_swap="68.9 GiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33497"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] build info | build=0 commit="unknown" tid="139795465789440" timestamp=1745457498
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139795465789440" timestamp=1745457498 total_threads=16
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33497" tid="139795465789440" timestamp=1745457498
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.390-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 18:18:18 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 18:18:19 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 18:18:19 launchpad ollama[768669]: INFO [main] model loaded | tid="139795465789440" timestamp=1745457499
+Apr 23 18:18:19 launchpad ollama[1550]: time=2025-04-23T18:18:19.395-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 18:18:19 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457499
+Apr 23 18:18:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:20 | 200 |   2.35511796s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:20 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457500
+Apr 23 18:18:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:21 | 200 |  910.036838ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:22 | 200 |  1.081006708s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:22 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457502
+Apr 23 18:18:23 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:23 | 200 |  832.260372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:23 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457503
+Apr 23 18:18:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:24 | 200 |  911.856095ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:25 | 200 |   1.26152979s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:25 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1661 n_keep=24 n_left=2024 n_shift=1012 tid="139795465789440" timestamp=1745457505
+Apr 23 18:18:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:30 | 200 |  4.607431092s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:19:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:35 | 200 |  195.270104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:35 | 200 |  182.918735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:36 | 200 |  1.129369059s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:37 | 200 |   97.142778ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:37 | 200 |  176.400084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:38 | 200 |  1.102297743s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:40 | 200 |   2.25545781s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:22:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:27 | 200 |  613.790527ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:28 | 200 |  579.137029ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:29 | 200 |  1.237361934s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:29 | 200 |  503.335489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:30 | 200 |  605.293164ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:31 | 200 |  988.700665ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:35 | 200 |  4.192560785s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:23:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:29 | 200 |  194.169262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:29 | 200 |  180.586316ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:30 | 200 |  1.140924415s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:30 | 200 |   60.542204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:31 | 200 |  181.996175ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:32 | 200 |  1.130667312s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:34 | 200 |  1.894084877s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:48 | 200 |  173.878237ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:48 | 200 |   174.43425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:49 | 200 |  981.156606ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:50 | 200 |  129.357036ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:50 | 200 |  169.839608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:51 | 200 |  747.644882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:53 | 200 |  2.242286178s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:40:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:40:18 | 200 |     618.408µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 08:40:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:40:18 | 200 |      28.394µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.589-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9296543744 required="6.2 GiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.589-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.590-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44627"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] build info | build=0 commit="unknown" tid="139752163717120" timestamp=1745509337
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139752163717120" timestamp=1745509337 total_threads=16
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44627" tid="139752163717120" timestamp=1745509337
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.842-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 08:42:17 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 08:42:18 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 08:42:18 launchpad ollama[792029]: INFO [main] model loaded | tid="139752163717120" timestamp=1745509338
+Apr 24 08:42:18 launchpad ollama[1550]: time=2025-04-24T08:42:18.845-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 08:42:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:42:24 | 200 |  6.819686708s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200992256 required="6.2 GiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38937"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.004-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] build info | build=0 commit="unknown" tid="139898628452352" timestamp=1745510013
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139898628452352" timestamp=1745510013 total_threads=16
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38937" tid="139898628452352" timestamp=1745510013
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.255-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 08:53:33 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 08:53:34 launchpad ollama[795550]: INFO [main] model loaded | tid="139898628452352" timestamp=1745510014
+Apr 24 08:53:34 launchpad ollama[1550]: time=2025-04-24T08:53:34.258-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 08:53:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:53:40 | 200 |   7.32566291s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:54:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:54:23 | 200 |  5.712938789s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:56:32 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:56:32 | 200 |   4.05194468s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.887-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297854464 required="6.2 GiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.887-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.888-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.888-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43035"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] build info | build=0 commit="unknown" tid="140587686600704" timestamp=1745510848
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140587686600704" timestamp=1745510848 total_threads=16
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43035" tid="140587686600704" timestamp=1745510848
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:07:29 launchpad ollama[1550]: time=2025-04-24T09:07:29.139-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:07:29 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:07:29 launchpad ollama[801791]: INFO [main] model loaded | tid="140587686600704" timestamp=1745510849
+Apr 24 09:07:30 launchpad ollama[1550]: time=2025-04-24T09:07:30.143-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 09:07:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:07:36 | 200 |  7.880096058s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:11:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:11:52 | 200 |  5.226571422s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:13:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:13:00 | 200 |  4.637749177s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9295757312 required="6.2 GiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33465"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.635-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] build info | build=0 commit="unknown" tid="139956127256576" timestamp=1745511550
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956127256576" timestamp=1745511550 total_threads=16
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33465" tid="139956127256576" timestamp=1745511550
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.886-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:19:10 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:19:11 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:19:11 launchpad ollama[804911]: INFO [main] model loaded | tid="139956127256576" timestamp=1745511551
+Apr 24 09:19:11 launchpad ollama[1550]: time=2025-04-24T09:19:11.890-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 09:19:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:19:18 | 200 |  7.632560487s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.324-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9097248768 required="6.2 GiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.324-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.325-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34541"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] build info | build=0 commit="unknown" tid="140467655323648" timestamp=1745512831
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140467655323648" timestamp=1745512831 total_threads=16
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34541" tid="140467655323648" timestamp=1745512831
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.577-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:40:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:40:32 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:40:32 launchpad ollama[811227]: INFO [main] model loaded | tid="140467655323648" timestamp=1745512832
+Apr 24 09:40:32 launchpad ollama[1550]: time=2025-04-24T09:40:32.581-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 09:40:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:40:41 | 200 |  9.911681647s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.517-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9051439104 required="6.2 GiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.517-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.518-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.518-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33395"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] build info | build=0 commit="unknown" tid="140658720501760" timestamp=1745513993
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140658720501760" timestamp=1745513993 total_threads=16
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33395" tid="140658720501760" timestamp=1745513993
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.769-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:59:53 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:59:54 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:59:54 launchpad ollama[817802]: INFO [main] model loaded | tid="140658720501760" timestamp=1745513994
+Apr 24 09:59:54 launchpad ollama[1550]: time=2025-04-24T09:59:54.774-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 10:00:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:00:00 | 200 |  7.142717191s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:03:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:03:27 | 200 |  8.348778676s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985182208 required="6.2 GiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.2 GiB" free_swap="68.9 GiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.303-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41833"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] build info | build=0 commit="unknown" tid="139819060563968" timestamp=1745514547
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139819060563968" timestamp=1745514547 total_threads=16
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41833" tid="139819060563968" timestamp=1745514547
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.554-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 10:09:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 10:09:08 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 10:09:08 launchpad ollama[820062]: INFO [main] model loaded | tid="139819060563968" timestamp=1745514548
+Apr 24 10:09:08 launchpad ollama[1550]: time=2025-04-24T10:09:08.558-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 10:09:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:09:13 | 200 |  6.476721392s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9072279552 required="6.2 GiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.8 GiB" free_swap="68.9 GiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35685"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.089-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] build info | build=0 commit="unknown" tid="139716123848704" timestamp=1745516365
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139716123848704" timestamp=1745516365 total_threads=16
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35685" tid="139716123848704" timestamp=1745516365
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.339-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 10:39:25 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 10:39:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 10:39:26 launchpad ollama[828764]: INFO [main] model loaded | tid="139716123848704" timestamp=1745516366
+Apr 24 10:39:26 launchpad ollama[1550]: time=2025-04-24T10:39:26.343-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 10:39:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:39:34 | 200 |  9.279915837s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:42:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:42:31 | 200 |  8.154876417s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:11:08 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:11:08 | 200 |       688.1µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 11:11:08 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:11:08 | 200 |      24.276µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.161-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9106620416 required="6.2 GiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.161-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.162-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46613"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] build info | build=0 commit="unknown" tid="140295154032640" timestamp=1745518313
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140295154032640" timestamp=1745518313 total_threads=16
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46613" tid="140295154032640" timestamp=1745518313
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.414-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 11:11:53 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 11:11:54 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 11:11:54 launchpad ollama[836914]: INFO [main] model loaded | tid="140295154032640" timestamp=1745518314
+Apr 24 11:11:54 launchpad ollama[1550]: time=2025-04-24T11:11:54.418-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 11:12:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:12:00 | 200 |  7.164835811s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:13:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:13:23 | 200 |  3.528328901s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:52:49 | 200 |     767.714µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 11:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:52:49 | 200 |      29.886µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9094627328 required="6.2 GiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45549"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] build info | build=0 commit="unknown" tid="140288459870208" timestamp=1745520785
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140288459870208" timestamp=1745520785 total_threads=16
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45549" tid="140288459870208" timestamp=1745520785
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.510-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 11:53:05 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 11:53:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 11:53:06 launchpad ollama[849688]: INFO [main] model loaded | tid="140288459870208" timestamp=1745520786
+Apr 24 11:53:06 launchpad ollama[1550]: time=2025-04-24T11:53:06.514-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 11:53:07 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:07 | 200 |  2.185362823s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:55 | 200 |  197.529098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:55 | 200 |  182.645663ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:56 | 200 |  794.044825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:54:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:54:01 | 200 |  4.798343251s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:36:21 | 200 |     989.017µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:36:21 | 200 |      31.076µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8993701888 required="6.2 GiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.499-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45731"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] build info | build=0 commit="unknown" tid="140650386157568" timestamp=1745541444
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140650386157568" timestamp=1745541444 total_threads=16
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45731" tid="140650386157568" timestamp=1745541444
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.750-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 17:37:24 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 17:37:25 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 17:37:25 launchpad ollama[907052]: INFO [main] model loaded | tid="140650386157568" timestamp=1745541445
+Apr 24 17:37:25 launchpad ollama[1550]: time=2025-04-24T17:37:25.755-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 17:37:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:26 | 200 |  2.090572142s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:27 | 200 |  641.604522ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:28 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:28 | 200 |   1.12206052s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:28 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:28 | 200 |  602.114962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:29 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:29 | 200 |  643.702023ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:30 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:30 | 200 |  947.871468ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:35 | 200 |  5.141216689s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:41:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:37 | 200 |  253.821343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:38 | 200 |   233.74639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  1.288696981s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  151.585566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  230.968708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:40 | 200 |  1.129138278s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:49 | 200 |  8.188421146s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9019129856 required="6.2 GiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36907"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] build info | build=0 commit="unknown" tid="140427190984704" timestamp=1745542123
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140427190984704" timestamp=1745542123 total_threads=16
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36907" tid="140427190984704" timestamp=1745542123
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: time=2025-04-24T17:48:44.168-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 17:48:44 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 17:48:44 launchpad ollama[908846]: INFO [main] model loaded | tid="140427190984704" timestamp=1745542124
+Apr 24 17:48:45 launchpad ollama[1550]: time=2025-04-24T17:48:45.173-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 17:48:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:45 | 200 |  1.710108976s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:45 | 200 |  261.643597ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:46 | 200 |  920.281896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:46 | 200 |  140.932473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:47 | 200 |  262.296399ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:48 | 200 |  931.731337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:54 | 200 |  6.073523978s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:50:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:40 | 200 |  200.958593ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:40 | 200 |  184.805268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  1.323112494s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  139.446503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  179.133677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:43 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:43 | 200 |  1.105684492s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:47 | 200 |  4.172603628s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:54:17 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:17 | 200 |  252.141997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:17 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:17 | 200 |  230.037406ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  1.238542557s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  149.908375ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  233.893207ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:20 | 200 |  1.134658459s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:25 | 200 |  4.419488004s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:56:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:52 | 200 |  997.425782ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:53 | 200 |   976.49382ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:54 | 200 |  1.319597493s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:55 | 200 |  854.225048ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:56 | 200 |  978.069183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:57 | 200 |  1.176739016s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:57:02 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:57:02 | 200 |  4.568611013s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:01:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:40 | 200 |  285.525117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:40 | 200 |  263.224302ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  1.284613852s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  183.780122ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  265.112546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:44 | 200 |  1.308307094s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:49 | 200 |  5.455469807s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:02:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:39 | 200 |    190.2052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:39 | 200 |  171.393986ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:40 | 200 |  1.207327682s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:40 | 200 |   85.131773ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:41 | 200 |  167.706202ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:41 | 200 |   740.56885ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:44 | 200 |  2.739863687s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:03:43 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:43 | 200 |  209.005892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:44 | 200 |  192.816082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  1.218627175s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  107.435739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  189.349972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:46 | 200 |  1.058925751s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:51 | 200 |  4.786284933s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:04:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:51 | 200 |  222.424954ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:52 | 200 |  197.341345ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |   1.41089556s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |  108.523858ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |  191.368248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:55 | 200 |  1.228077154s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:59 | 200 |  4.042051614s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:07:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:37 | 200 |  226.879035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:37 | 200 |  199.111256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:38 | 200 |  1.063590276s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:39 | 200 |   72.477831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:39 | 200 |   195.27252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:40 | 200 |  1.203461936s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:46 | 200 |   5.63723919s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:12:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:31 | 200 |  268.460203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:31 | 200 |  255.364856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  1.256656059s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  169.377662ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  249.408594ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:34 | 200 |  1.110234089s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:40 | 200 |  5.827003425s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:13:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:37 | 200 |  205.160617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:37 | 200 |  188.095185ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:38 | 200 |  1.280872614s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:38 | 200 |  101.873219ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:39 | 200 |  184.533949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:40 | 200 |  1.034372568s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:44 | 200 |  4.254397145s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:16:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:25 | 200 |  242.856859ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:25 | 200 |  223.365467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  1.113844362s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  141.013523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  218.619502ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:27 | 200 |  1.082765987s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:33 | 200 |  5.733372573s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.927-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8970436608 required="6.2 GiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.927-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.928-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46517"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] build info | build=0 commit="unknown" tid="140099898662912" timestamp=1745545145
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140099898662912" timestamp=1745545145 total_threads=16
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46517" tid="140099898662912" timestamp=1745545145
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: time=2025-04-24T18:39:06.181-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 18:39:06 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 18:39:06 launchpad ollama[919991]: INFO [main] model loaded | tid="140099898662912" timestamp=1745545146
+Apr 24 18:39:07 launchpad ollama[1550]: time=2025-04-24T18:39:07.184-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 18:39:07 launchpad ollama[919991]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3479 n_keep=24 n_left=2024 n_shift=1012 tid="140099898662912" timestamp=1745545147
+Apr 24 18:39:15 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:15 | 200 |  9.293126049s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:39:16 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:16 | 200 |   1.81402386s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:18 | 200 |  1.817578195s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:20 | 200 |  1.321442909s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:21 | 200 |  1.768898059s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:23 | 200 |  1.815065399s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:24 | 200 |   1.13415786s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:31 | 200 |  6.271196545s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:42:12 launchpad ollama[919991]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1645 n_keep=24 n_left=2024 n_shift=1012 tid="140099898662912" timestamp=1745545332
+Apr 24 18:42:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:20 | 200 |  8.327494275s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:42:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:20 | 200 |  225.432321ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:21 | 200 |  226.489769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |   1.16798263s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |  144.278618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |  226.540793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:24 | 200 |  1.284153205s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:27 | 200 |  3.720294838s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:43:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:34 | 200 |  1.024618794s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:43:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:35 | 200 |   179.97332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:35 | 200 |   180.02421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |  1.090675027s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |   98.967754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |  178.429106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:37 | 200 |  857.260775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:42 | 200 |  5.007495698s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  1.023029902s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  187.113902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  183.078764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:52 | 200 |  1.127197605s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:53 | 200 |  102.356162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:53 | 200 |  185.324031ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:54 | 200 |  1.031321065s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:57 | 200 |  3.434281484s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:48:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:48:59 | 200 |  4.081209653s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:48:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:48:59 | 200 |   288.92447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:00 | 200 |  286.772565ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  1.333809227s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  203.341443ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  285.595388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:03 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:03 | 200 |  1.091997316s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:06 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:06 | 200 |  3.462818189s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:49 | 200 |  1.053412836s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:49 | 200 |  217.336173ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:50 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:50 | 200 |  216.634086ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  932.691712ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  135.776263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  216.992277ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:52 | 200 |  1.014736642s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:57 | 200 |  5.298822783s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.186-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9081978880 required="6.2 GiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.186-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.187-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.187-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40493"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] build info | build=0 commit="unknown" tid="140547275657216" timestamp=1745546312
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140547275657216" timestamp=1745546312 total_threads=16
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40493" tid="140547275657216" timestamp=1745546312
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.439-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 18:58:32 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 18:58:33 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 18:58:33 launchpad ollama[923081]: INFO [main] model loaded | tid="140547275657216" timestamp=1745546313
+Apr 24 18:58:33 launchpad ollama[1550]: time=2025-04-24T18:58:33.441-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 18:58:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:37 | 200 |  5.044834878s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:58:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:54 | 200 |  285.297208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:54 | 200 |   260.43005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:55 | 200 |  952.027957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:56 | 200 |  218.050077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:56 | 200 |  261.842981ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:57 | 200 |  920.633754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:59:04 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:59:04 | 200 |  7.195869601s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 19:01:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:47 | 200 |  192.035927ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:47 | 200 |  180.653514ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |  1.179824626s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |   96.208666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |  178.822281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:50 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:50 | 200 |  1.257093011s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:55 | 200 |  5.687962194s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 19:03:11 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:11 | 200 |  187.164947ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:11 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:11 | 200 |  169.136714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |  1.328971141s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |   84.264482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |  166.751689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:15 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:15 | 200 |  1.536837721s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:20 | 200 |  4.833257563s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:06:02 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:06:02 | 200 |     656.506µs |       127.0.0.1 | GET      "/api/tags"
+Apr 25 10:06:02 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:06:02 | 200 |      34.553µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9075621888 required="6.2 GiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32785"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] build info | build=0 commit="unknown" tid="140636594302976" timestamp=1745601201
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140636594302976" timestamp=1745601201 total_threads=16
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32785" tid="140636594302976" timestamp=1745601201
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.568-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:13:21 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:13:22 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:13:22 launchpad ollama[935186]: INFO [main] model loaded | tid="140636594302976" timestamp=1745601202
+Apr 25 10:13:22 launchpad ollama[1550]: time=2025-04-25T10:13:22.571-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 25 10:13:23 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:23 | 200 |  2.603573209s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:24 | 200 |  1.154762644s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:25 | 200 |  912.044721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:27 | 200 |   1.11257606s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:28 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:28 | 200 |  1.195339685s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:29 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:29 | 200 |  948.435089ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:31 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:31 | 200 |  2.700882535s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.670-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8978890752 required="6.2 GiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.670-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.671-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44195"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] build info | build=0 commit="unknown" tid="140383647281152" timestamp=1745601563
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140383647281152" timestamp=1745601563 total_threads=16
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44195" tid="140383647281152" timestamp=1745601563
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.924-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:19:23 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:19:24 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:19:24 launchpad ollama[936144]: INFO [main] model loaded | tid="140383647281152" timestamp=1745601564
+Apr 25 10:19:24 launchpad ollama[1550]: time=2025-04-25T10:19:24.927-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 25 10:19:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:25 | 200 |  1.663071879s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:25 | 200 |   202.88545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |   958.12173ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |  159.322856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |  201.932104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:27 | 200 |  804.617649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:33 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:33 | 200 |   5.64176218s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985313280 required="6.2 GiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37745"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] build info | build=0 commit="unknown" tid="139894329585664" timestamp=1745601952
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139894329585664" timestamp=1745601952 total_threads=16
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37745" tid="139894329585664" timestamp=1745601952
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.618-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:25:52 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:25:53 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:25:53 launchpad ollama[937189]: INFO [main] model loaded | tid="139894329585664" timestamp=1745601953
+Apr 25 10:25:53 launchpad ollama[1550]: time=2025-04-25T10:25:53.622-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 25 10:25:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:53 | 200 |  1.618330487s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:53 | 200 |  176.006639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:54 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:54 | 200 |  1.012885767s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:55 | 200 |   52.663758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:55 | 200 |  133.618171ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:56 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:56 | 200 |  1.337080749s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:26:03 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:26:03 | 200 |  7.192816778s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:27:06 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:06 | 200 |  216.879351ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:07 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:07 | 200 |  202.280451ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |  1.258996057s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |   113.90707ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |  195.061757ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:10 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:10 | 200 |  1.358440515s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:14 | 200 |  4.660747669s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:31:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:24 | 200 |  262.036491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:24 | 200 |  237.489923ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:25 | 200 |  1.239453145s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:26 | 200 |  196.405222ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:26 | 200 |  236.354144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:27 | 200 |  1.020126427s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:30 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:30 | 200 |  2.991535105s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:42:39 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:42:39 | 200 |      26.374µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.408-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.569-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.570-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 40315"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] build info | build=0 commit="unknown" tid="139938947252224" timestamp=1745603096
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139938947252224" timestamp=1745603096 total_threads=16
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40315" tid="139938947252224" timestamp=1745603096
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:44:56 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.876-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:44:57 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 10:44:57 launchpad ollama[940509]: INFO [main] model loaded | tid="139938947252224" timestamp=1745603097
+Apr 25 10:44:57 launchpad ollama[1550]: time=2025-04-25T10:44:57.880-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 25 10:45:32 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:32 | 200 | 35.885928419s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:45:32 launchpad ollama[1550]: time=2025-04-25T10:45:32.360-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:45:34 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:34 | 200 |  2.174434507s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:45:34 launchpad ollama[1550]: time=2025-04-25T10:45:34.577-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:45:37 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:37 | 200 |  2.734380952s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:50:17 launchpad ollama[1550]: time=2025-04-25T10:50:17.553-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:50:46 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:50:46 | 200 | 29.139636083s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:53:00 launchpad ollama[1550]: time=2025-04-25T10:53:00.792-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:53:13 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:53:13 | 200 |   12.4257238s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:53:55 launchpad ollama[1550]: time=2025-04-25T10:53:55.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:54:45 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:54:45 | 200 | 50.075986347s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.698-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.865-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.865-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 36671"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] build info | build=0 commit="unknown" tid="139775431499776" timestamp=1745604122
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139775431499776" timestamp=1745604122 total_threads=16
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36671" tid="139775431499776" timestamp=1745604122
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 11:02:02 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: time=2025-04-25T11:02:03.167-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 11:02:03 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 11:02:03 launchpad ollama[972082]: INFO [main] model loaded | tid="139775431499776" timestamp=1745604123
+Apr 25 11:02:04 launchpad ollama[1550]: time=2025-04-25T11:02:04.170-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 25 11:02:20 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:02:20 | 200 | 18.013133037s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:04:23 launchpad ollama[1550]: time=2025-04-25T11:04:23.808-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:04:47 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:04:47 | 200 | 24.172131361s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:05:50 launchpad ollama[1550]: time=2025-04-25T11:05:50.437-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:06:04 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:06:04 | 200 | 14.112729529s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:07:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:07:24 | 200 |      28.291µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 11:09:20 launchpad ollama[1550]: time=2025-04-25T11:09:20.794-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:11:22 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:11:22 | 200 |          2m2s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:40:42 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:40:42 | 200 |      16.444µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:40:42 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:40:42 | 200 |      46.139µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:41:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:41:27 | 200 |     582.632µs |       127.0.0.1 | GET      "/api/tags"
+Apr 25 12:41:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:41:27 | 200 |      26.285µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.687-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.854-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.4 GiB" free_swap="68.9 GiB"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.854-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.855-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 42199"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] build info | build=0 commit="unknown" tid="140557161463808" timestamp=1745610127
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140557161463808" timestamp=1745610127 total_threads=16
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42199" tid="140557161463808" timestamp=1745610127
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 12:42:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: time=2025-04-25T12:42:08.154-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 12:42:08 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 12:42:08 launchpad ollama[1049016]: INFO [main] model loaded | tid="140557161463808" timestamp=1745610128
+Apr 25 12:42:09 launchpad ollama[1550]: time=2025-04-25T12:42:09.158-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 25 12:42:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:14 | 200 |       14.74µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:42:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:14 | 200 |      16.628µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:42:23 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:23 | 200 | 15.714145874s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:42:58 launchpad ollama[1550]: time=2025-04-25T12:42:58.623-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:43:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:43:14 | 200 | 15.942858189s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:45:12 launchpad ollama[1550]: time=2025-04-25T12:45:12.245-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:45:38 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:45:38 | 200 |      19.429µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:45:38 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:45:38 | 200 |      24.797µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:46:31 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:46:31 | 200 |         1m19s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:47:38 launchpad ollama[1550]: time=2025-04-25T12:47:38.974-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:48:01 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:48:01 | 200 | 22.109538778s |       127.0.0.1 | POST     "/api/chat"
diff --git a/flakes/tensorflow/ollama.log b/flakes/tensorflow/ollama.log
new file mode 100644
index 0000000..b233ce8
--- /dev/null
+++ b/flakes/tensorflow/ollama.log
@@ -0,0 +1,67607 @@
+Nov 29 11:13:56 launchpad systemd[1]: Stopping Server for local large language models...
+Nov 29 11:13:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Nov 29 11:13:56 launchpad systemd[1]: Stopped Server for local large language models.
+Nov 29 11:13:56 launchpad systemd[1]: ollama.service: Consumed 5.334s CPU time, no IP traffic.
+Nov 29 11:13:56 launchpad systemd[1]: Started Server for local large language models.
+Nov 29 11:13:56 launchpad ollama[549315]: 2024/11/29 11:13:56 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.805-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Nov 29 11:13:56 launchpad ollama[549315]:  - using env:        export GIN_MODE=release
+Nov 29 11:13:56 launchpad ollama[549315]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Nov 29 11:13:56 launchpad ollama[549315]: time=2024-11-29T11:13:56.806-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama4178292014/runners
+Nov 29 11:14:00 launchpad ollama[549315]: time=2024-11-29T11:14:00.642-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Nov 29 11:14:00 launchpad ollama[549315]: time=2024-11-29T11:14:00.711-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="10.3 GiB"
+Nov 29 11:17:00 launchpad systemd[1]: Stopping Server for local large language models...
+Nov 29 11:17:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Nov 29 11:17:00 launchpad systemd[1]: Stopped Server for local large language models.
+Nov 29 11:17:00 launchpad systemd[1]: ollama.service: Consumed 5.182s CPU time, no IP traffic.
+-- Boot 47c23a229c4c4528a9dcd93e2546681d --
+Nov 29 11:18:42 launchpad systemd[1]: Started Server for local large language models.
+Nov 29 11:18:42 launchpad ollama[1650]: 2024/11/29 11:18:42 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.238-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.243-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Nov 29 11:18:42 launchpad ollama[1650]:  - using env:        export GIN_MODE=release
+Nov 29 11:18:42 launchpad ollama[1650]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.244-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Nov 29 11:18:42 launchpad ollama[1650]: time=2024-11-29T11:18:42.244-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama232997375/runners
+Nov 29 11:18:46 launchpad ollama[1650]: time=2024-11-29T11:18:46.244-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Nov 29 11:18:46 launchpad ollama[1650]: time=2024-11-29T11:18:46.322-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Nov 29 15:43:43 launchpad systemd[1]: Stopping Server for local large language models...
+Nov 29 15:43:43 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Nov 29 15:43:43 launchpad systemd[1]: Stopped Server for local large language models.
+Nov 29 15:43:43 launchpad systemd[1]: ollama.service: Consumed 5.606s CPU time, 1.1G memory peak, 0B memory swap peak, no IP traffic.
+-- Boot f721c7d615d94dafb053feaf5ec77243 --
+Nov 29 15:44:29 launchpad systemd[1]: Started Server for local large language models.
+Nov 29 15:44:29 launchpad ollama[1636]: 2024/11/29 15:44:29 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.739-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.745-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Nov 29 15:44:29 launchpad ollama[1636]:  - using env:        export GIN_MODE=release
+Nov 29 15:44:29 launchpad ollama[1636]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.745-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Nov 29 15:44:29 launchpad ollama[1636]: time=2024-11-29T15:44:29.746-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama3406801642/runners
+Nov 29 15:44:33 launchpad ollama[1636]: time=2024-11-29T15:44:33.917-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cuda_v12 cpu cpu_avx cpu_avx2]"
+Nov 29 15:44:34 launchpad ollama[1636]: time=2024-11-29T15:44:34.008-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 |     877.976µs |       127.0.0.1 | HEAD     "/"
+Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 |    2.076826ms |       127.0.0.1 | POST     "/api/show"
+Dec 01 10:52:29 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:29 | 200 |     303.403µs |       127.0.0.1 | POST     "/api/show"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.808-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.930-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.930-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=35 memory.available="8.1 GiB" memory.required.full="9.1 GiB" memory.required.partial="8.0 GiB" memory.required.kv="1.6 GiB" memory.weights.total="6.8 GiB" memory.weights.repeating="6.6 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama3406801642/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 35 --parallel 1 --port 46225"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.931-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 01 10:52:29 launchpad ollama[1636]: time=2024-12-01T10:52:29.932-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] build info | build=0 commit="unknown" tid="140289154994176" timestamp=1733079149
+Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140289154994176" timestamp=1733079149 total_threads=16
+Dec 01 10:52:29 launchpad ollama[689882]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46225" tid="140289154994176" timestamp=1733079149
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type  f32:   81 tensors
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type q4_0:  281 tensors
+Dec 01 10:52:29 launchpad ollama[1636]: llama_model_loader: - type q6_K:    1 tensors
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_vocab: mismatch in special tokens definition ( 264/32016 vs 259/32016 ).
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: format           = GGUF V2
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: arch             = llama
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: vocab type       = SPM
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_vocab          = 32016
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_merges         = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd           = 5120
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_head           = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_head_kv        = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_layer          = 40
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_rot            = 128
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_gqa            = 1
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_ff             = 13824
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_expert         = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_expert_used    = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: causal attn      = 1
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: pooling type     = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope type        = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope scaling     = linear
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: freq_scale_train = 1
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: n_yarn_orig_ctx  = 16384
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_d_state      = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model type       = 13B
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model ftype      = Q4_0
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model params     = 13.02 B
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: general.name     = codellama
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: BOS token        = 1 ''
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: EOS token        = 2 ''
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: UNK token        = 0 ''
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 01 10:52:29 launchpad ollama[1636]: ggml_cuda_init: found 1 CUDA devices:
+Dec 01 10:52:29 launchpad ollama[1636]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 01 10:52:30 launchpad ollama[1636]: llm_load_tensors: ggml ctx size =    0.37 MiB
+Dec 01 10:52:30 launchpad ollama[1636]: time=2024-12-01T10:52:30.182-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors: offloading 35 repeating layers to GPU
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors: offloaded 35/41 layers to GPU
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llm_load_tensors:      CUDA0 buffer size =  5956.84 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: ...................................................................................................
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_ctx      = 2048
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_batch    = 512
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: n_ubatch   = 512
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: freq_scale = 1
+Dec 01 10:52:37 launchpad ollama[1636]: llama_kv_cache_init:  CUDA_Host KV buffer size =   200.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_kv_cache_init:      CUDA0 KV buffer size =  1400.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: graph nodes  = 1286
+Dec 01 10:52:37 launchpad ollama[1636]: llama_new_context_with_model: graph splits = 59
+Dec 01 10:52:38 launchpad ollama[689882]: INFO [main] model loaded | tid="140289154994176" timestamp=1733079158
+Dec 01 10:52:38 launchpad ollama[1636]: time=2024-12-01T10:52:38.205-08:00 level=INFO source=server.go:545 msg="llama runner started in 8.27 seconds"
+Dec 01 10:52:38 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:52:38 | 200 |  8.397271488s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 10:56:18 launchpad ollama[1636]: time=2024-12-01T10:56:18.865-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 10:56:43 launchpad ollama[1636]: [GIN] 2024/12/01 - 10:56:43 | 200 | 25.105856746s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 10:57:46 launchpad ollama[1636]: time=2024-12-01T10:57:46.978-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 11:02:36 launchpad ollama[1636]: [GIN] 2024/12/01 - 11:02:36 | 200 |         4m49s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 11:04:38 launchpad ollama[1636]: time=2024-12-01T11:04:38.157-08:00 level=WARN source=types.go:384 msg="invalid option provided" option=""
+Dec 01 11:05:08 launchpad ollama[1636]: [GIN] 2024/12/01 - 11:05:08 | 200 | 29.954345658s |       127.0.0.1 | POST     "/api/chat"
+Dec 01 18:58:47 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 01 18:58:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 01 18:58:47 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 01 18:58:47 launchpad systemd[1]: ollama.service: Consumed 30min 11.253s CPU time, 8.4G memory peak, 0B memory swap peak, received 1.1M IP traffic, sent 1.8M IP traffic.
+-- Boot 8b6024c7dfc34d2e950bf41d40c128b0 --
+Dec 02 07:06:20 launchpad systemd[1]: Started Server for local large language models.
+Dec 02 07:06:20 launchpad ollama[1681]: 2024/12/02 07:06:20 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.241-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.247-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 02 07:06:20 launchpad ollama[1681]:  - using env:        export GIN_MODE=release
+Dec 02 07:06:20 launchpad ollama[1681]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.247-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 02 07:06:20 launchpad ollama[1681]: time=2024-12-02T07:06:20.248-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama117859918/runners
+Dec 02 07:06:24 launchpad ollama[1681]: time=2024-12-02T07:06:24.349-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 02 07:06:24 launchpad ollama[1681]: time=2024-12-02T07:06:24.437-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 03 10:46:13 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 03 10:46:13 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 03 10:46:13 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 03 10:46:13 launchpad systemd[1]: ollama.service: Consumed 5.926s CPU time, 1.1G memory peak, 0B memory swap peak, no IP traffic.
+-- Boot 3c3403eb7cf049c0ba9e3cae99d71216 --
+Dec 03 10:47:01 launchpad systemd[1]: Started Server for local large language models.
+Dec 03 10:47:01 launchpad ollama[1659]: 2024/12/03 10:47:01 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.471-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 03 10:47:01 launchpad ollama[1659]:  - using env:        export GIN_MODE=release
+Dec 03 10:47:01 launchpad ollama[1659]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 03 10:47:01 launchpad ollama[1659]: time=2024-12-03T10:47:01.476-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama2477072148/runners
+Dec 03 10:47:05 launchpad ollama[1659]: time=2024-12-03T10:47:05.404-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx cpu_avx2 cuda_v12 cpu]"
+Dec 03 10:47:05 launchpad ollama[1659]: time=2024-12-03T10:47:05.493-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |     719.091µs |       127.0.0.1 | HEAD     "/"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |    1.573258ms |       127.0.0.1 | POST     "/api/show"
+Dec 03 13:44:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:28 | 200 |     340.475µs |       127.0.0.1 | POST     "/api/show"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.661-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.661-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43623"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.662-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] build info | build=0 commit="unknown" tid="140027875844096" timestamp=1733262268
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140027875844096" timestamp=1733262268 total_threads=16
+Dec 03 13:44:28 launchpad ollama[108055]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43623" tid="140027875844096" timestamp=1733262268
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 13:44:28 launchpad ollama[1659]: time=2024-12-03T13:44:28.913-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 13:44:28 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 13:44:28 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 13:44:28 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 13:44:29 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 13:44:33 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: .......................................................................................
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 13:44:34 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 13:44:34 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 13:44:34 launchpad ollama[108055]: INFO [main] model loaded | tid="140027875844096" timestamp=1733262274
+Dec 03 13:44:34 launchpad ollama[1659]: time=2024-12-03T13:44:34.683-08:00 level=INFO source=server.go:545 msg="llama runner started in 6.02 seconds"
+Dec 03 13:44:34 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:34 | 200 |  6.598303492s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 13:44:59 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:44:59 | 200 |  8.241165407s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.325-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36067"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.327-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.328-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] build info | build=0 commit="unknown" tid="140501364985856" timestamp=1733263189
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140501364985856" timestamp=1733263189 total_threads=16
+Dec 03 13:59:49 launchpad ollama[116260]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36067" tid="140501364985856" timestamp=1733263189
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 13:59:49 launchpad ollama[1659]: time=2024-12-03T13:59:49.579-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 13:59:49 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 13:59:49 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 13:59:49 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: .......................................................................................
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 13:59:50 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 13:59:50 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 13:59:50 launchpad ollama[116260]: INFO [main] model loaded | tid="140501364985856" timestamp=1733263190
+Dec 03 13:59:50 launchpad ollama[1659]: time=2024-12-03T13:59:50.332-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.01 seconds"
+Dec 03 13:59:55 launchpad ollama[1659]: [GIN] 2024/12/03 - 13:59:55 | 200 |  6.628493344s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.779-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.779-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34129"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:11:53 launchpad ollama[1659]: time=2024-12-03T14:11:53.780-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] build info | build=0 commit="unknown" tid="139942337175552" timestamp=1733263913
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139942337175552" timestamp=1733263913 total_threads=16
+Dec 03 14:11:53 launchpad ollama[123232]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34129" tid="139942337175552" timestamp=1733263913
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:11:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:11:54 launchpad ollama[1659]: time=2024-12-03T14:11:54.031-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:11:54 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:11:54 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:11:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:11:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:11:54 launchpad ollama[123232]: INFO [main] model loaded | tid="139942337175552" timestamp=1733263914
+Dec 03 14:11:54 launchpad ollama[1659]: time=2024-12-03T14:11:54.784-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:12:03 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:12:03 | 200 | 10.801991113s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:16:25 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:16:25 | 200 | 10.381062652s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.088-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.088-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37643"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.089-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] build info | build=0 commit="unknown" tid="140408150876160" timestamp=1733265498
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140408150876160" timestamp=1733265498 total_threads=16
+Dec 03 14:38:18 launchpad ollama[137351]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37643" tid="140408150876160" timestamp=1733265498
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:38:18 launchpad ollama[1659]: time=2024-12-03T14:38:18.340-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:38:18 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:38:18 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:38:18 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:38:19 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:38:19 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:38:19 launchpad ollama[137351]: INFO [main] model loaded | tid="140408150876160" timestamp=1733265499
+Dec 03 14:38:19 launchpad ollama[1659]: time=2024-12-03T14:38:19.093-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:38:23 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:38:23 | 200 |  5.665344191s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.631-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 45641"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.632-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] build info | build=0 commit="unknown" tid="140122244763648" timestamp=1733266571
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122244763648" timestamp=1733266571 total_threads=16
+Dec 03 14:56:11 launchpad ollama[146881]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45641" tid="140122244763648" timestamp=1733266571
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 14:56:11 launchpad ollama[1659]: time=2024-12-03T14:56:11.883-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 14:56:11 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 14:56:11 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 14:56:11 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: .......................................................................................
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 14:56:12 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 14:56:12 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 14:56:12 launchpad ollama[146881]: INFO [main] model loaded | tid="140122244763648" timestamp=1733266572
+Dec 03 14:56:12 launchpad ollama[1659]: time=2024-12-03T14:56:12.635-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 14:56:16 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:56:16 | 200 |  5.941380448s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:58:25 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:58:25 | 200 |  4.217491423s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 14:59:56 launchpad ollama[1659]: [GIN] 2024/12/03 - 14:59:56 | 200 |  2.892466931s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:03:18 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:03:18 | 200 |  4.368670418s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:04:50 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:04:50 | 200 |  3.542498449s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:06:08 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:06:08 | 200 |  3.802530471s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:07:06 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:07:06 | 200 |  2.983658913s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44625"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.565-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] build info | build=0 commit="unknown" tid="140519775850496" timestamp=1733267573
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140519775850496" timestamp=1733267573 total_threads=16
+Dec 03 15:12:53 launchpad ollama[155772]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44625" tid="140519775850496" timestamp=1733267573
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 15:12:53 launchpad ollama[1659]: time=2024-12-03T15:12:53.816-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 15:12:53 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 15:12:53 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 15:12:53 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 15:12:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 15:12:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 15:12:54 launchpad ollama[155772]: INFO [main] model loaded | tid="140519775850496" timestamp=1733267574
+Dec 03 15:12:54 launchpad ollama[1659]: time=2024-12-03T15:12:54.570-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 15:12:58 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:12:58 | 200 |  5.654989835s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44595"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.194-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] build info | build=0 commit="unknown" tid="140205527814144" timestamp=1733268113
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140205527814144" timestamp=1733268113 total_threads=16
+Dec 03 15:21:53 launchpad ollama[160851]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44595" tid="140205527814144" timestamp=1733268113
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 15:21:53 launchpad ollama[1659]: time=2024-12-03T15:21:53.445-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 15:21:53 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 15:21:53 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 15:21:53 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: .......................................................................................
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 15:21:54 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 15:21:54 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 15:21:54 launchpad ollama[160851]: INFO [main] model loaded | tid="140205527814144" timestamp=1733268114
+Dec 03 15:21:54 launchpad ollama[1659]: time=2024-12-03T15:21:54.450-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.26 seconds"
+Dec 03 15:22:01 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:22:01 | 200 |  8.782713142s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:23:21 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:23:21 | 200 |  4.122703789s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:25:15 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:25:15 | 200 |  5.794661393s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:28:42 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:28:42 | 200 |   4.39913915s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:30:46 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:30:46 | 200 |  6.549918347s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:31:46 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:31:46 | 200 |   6.76762143s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:34:34 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:34:34 | 200 |  7.400947696s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:36:00 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:36:00 | 200 |  7.297590238s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:37:59 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:37:59 | 200 |  6.233302043s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:38:47 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:38:47 | 200 |  5.540288603s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:39:30 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:39:30 | 200 |  7.378510802s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 15:42:35 launchpad ollama[1659]: [GIN] 2024/12/03 - 15:42:35 | 200 |  8.335320048s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38289"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.007-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] build info | build=0 commit="unknown" tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139686128603136" timestamp=1733270987 total_threads=16
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38289" tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 03 16:09:47 launchpad ollama[1659]: time=2024-12-03T16:09:47.258-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 03 16:09:47 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 03 16:09:47 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: .......................................................................................
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 03 16:09:47 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 03 16:09:47 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 03 16:09:47 launchpad ollama[187509]: INFO [main] model loaded | tid="139686128603136" timestamp=1733270987
+Dec 03 16:09:48 launchpad ollama[1659]: time=2024-12-03T16:09:48.011-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 03 16:09:57 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:09:57 | 200 | 10.877322797s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:14:11 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:14:11 | 200 |  7.633343909s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:15:03 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:15:03 | 200 |  6.904140221s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:16:40 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:16:40 | 200 |  7.851581347s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:18:28 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:18:28 | 200 |   8.12679671s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:21:31 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:21:31 | 200 |  6.485433807s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:23:21 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:23:21 | 200 |  5.711659211s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:26:13 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:26:13 | 200 |   6.95786815s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:28:17 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:28:17 | 200 |  7.713706455s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:29:52 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:29:52 | 200 |  8.731658639s |       127.0.0.1 | POST     "/api/chat"
+Dec 03 16:30:53 launchpad ollama[1659]: [GIN] 2024/12/03 - 16:30:53 | 200 |   6.95133823s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |       16.01µs |       127.0.0.1 | HEAD     "/"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |     358.412µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:17:00 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:00 | 200 |      327.29µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.385-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.0 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.0 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 42781"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.386-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] build info | build=0 commit="unknown" tid="140617243279360" timestamp=1733332621
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140617243279360" timestamp=1733332621 total_threads=16
+Dec 04 09:17:01 launchpad ollama[292482]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42781" tid="140617243279360" timestamp=1733332621
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 09:17:01 launchpad ollama[1659]: time=2024-12-04T09:17:01.638-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 09:17:01 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 09:17:01 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 09:17:01 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: .......................................................................................
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 09:17:02 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 09:17:02 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 09:17:02 launchpad ollama[292482]: INFO [main] model loaded | tid="140617243279360" timestamp=1733332622
+Dec 04 09:17:02 launchpad ollama[1659]: time=2024-12-04T09:17:02.392-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.01 seconds"
+Dec 04 09:17:02 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:02 | 200 |  1.593119474s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:17:25 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:17:25 | 200 |  3.968985607s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:18:29 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:18:29 | 200 |  1.737632457s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:19:26 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:19:26 | 200 |  2.742525699s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:20:37 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:20:37 | 200 |  4.027589705s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:22:08 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:22:08 | 200 |  4.770886706s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:23:16 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:23:16 | 200 |  4.342790103s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:24:19 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:24:19 | 200 |  5.521583489s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:25:48 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:25:48 | 200 |  5.651255584s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:28:28 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:28:28 | 200 |  3.200879754s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:29:17 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:29:17 | 200 |  4.444700133s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:29:45 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:29:45 | 200 |  6.600032671s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:31:35 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:31:35 | 200 |  6.134691517s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |      15.281µs |       127.0.0.1 | HEAD     "/"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     302.976µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     233.124µs |       127.0.0.1 | POST     "/api/show"
+Dec 04 09:33:12 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:33:12 | 200 |     536.018µs |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:34:43 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:34:43 | 200 |  720.970832ms |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:35:29 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:35:29 | 200 |  2.584235748s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:37:22 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:37:22 | 200 |  4.808000486s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:38:52 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:38:52 | 200 |  3.041747597s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:40:35 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:40:35 | 200 |  7.007546328s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:42:21 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:42:21 | 200 |  9.182585003s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:47:21 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:47:21 | 200 |  8.269648658s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:51:02 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:51:02 | 200 |  7.064598792s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 09:54:37 launchpad ollama[1659]: [GIN] 2024/12/04 - 09:54:37 | 200 |  7.891324419s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.178-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.178-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 44941"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.179-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] build info | build=0 commit="unknown" tid="140313003261952" timestamp=1733335931
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140313003261952" timestamp=1733335931 total_threads=16
+Dec 04 10:12:11 launchpad ollama[322719]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44941" tid="140313003261952" timestamp=1733335931
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:12:11 launchpad ollama[1659]: time=2024-12-04T10:12:11.431-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:12:11 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:12:11 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:12:11 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:12:12 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:12:12 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:12:12 launchpad ollama[322719]: INFO [main] model loaded | tid="140313003261952" timestamp=1733335932
+Dec 04 10:12:12 launchpad ollama[1659]: time=2024-12-04T10:12:12.184-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:12:24 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:12:24 | 200 | 14.291565705s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.616-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40401"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.617-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] build info | build=0 commit="unknown" tid="139769284988928" timestamp=1733336649
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139769284988928" timestamp=1733336649 total_threads=16
+Dec 04 10:24:09 launchpad ollama[329081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40401" tid="139769284988928" timestamp=1733336649
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:24:09 launchpad ollama[1659]: time=2024-12-04T10:24:09.868-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:24:09 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:24:09 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:24:09 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:24:10 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:24:10 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:24:10 launchpad ollama[329081]: INFO [main] model loaded | tid="139769284988928" timestamp=1733336650
+Dec 04 10:24:10 launchpad ollama[1659]: time=2024-12-04T10:24:10.621-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:24:19 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:24:19 | 200 | 10.579786736s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:25:48 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:25:48 | 200 |   9.70843195s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.125-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.125-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43823"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.126-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] build info | build=0 commit="unknown" tid="140683282182144" timestamp=1733337649
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140683282182144" timestamp=1733337649 total_threads=16
+Dec 04 10:40:49 launchpad ollama[337927]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43823" tid="140683282182144" timestamp=1733337649
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:40:49 launchpad ollama[1659]: time=2024-12-04T10:40:49.377-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:40:49 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:40:49 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:40:49 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:40:50 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:40:50 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:40:50 launchpad ollama[337927]: INFO [main] model loaded | tid="140683282182144" timestamp=1733337650
+Dec 04 10:40:50 launchpad ollama[1659]: time=2024-12-04T10:40:50.129-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:40:59 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:40:59 | 200 | 11.099586731s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.910-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46479"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 10:51:09 launchpad ollama[1659]: time=2024-12-04T10:51:09.911-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] build info | build=0 commit="unknown" tid="139637822832640" timestamp=1733338269
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139637822832640" timestamp=1733338269 total_threads=16
+Dec 04 10:51:09 launchpad ollama[343427]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46479" tid="139637822832640" timestamp=1733338269
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 10:51:09 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 10:51:10 launchpad ollama[1659]: time=2024-12-04T10:51:10.162-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 10:51:10 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 10:51:10 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: .......................................................................................
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 10:51:10 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 10:51:10 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 10:51:10 launchpad ollama[343427]: INFO [main] model loaded | tid="139637822832640" timestamp=1733338270
+Dec 04 10:51:10 launchpad ollama[1659]: time=2024-12-04T10:51:10.914-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 04 10:51:18 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:51:18 | 200 |  9.459881992s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 10:54:55 launchpad ollama[1659]: [GIN] 2024/12/04 - 10:54:55 | 200 |  4.748420486s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.219-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.2 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2477072148/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37867"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.220-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] build info | build=0 commit="unknown" tid="140276627443712" timestamp=1733338867
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140276627443712" timestamp=1733338867 total_threads=16
+Dec 04 11:01:07 launchpad ollama[348732]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37867" tid="140276627443712" timestamp=1733338867
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type  f32:   65 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type q4_0:  225 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: llama_model_loader: - type q6_K:    1 tensors
+Dec 04 11:01:07 launchpad ollama[1659]: time=2024-12-04T11:01:07.471-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: arch             = llama
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: vocab type       = BPE
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_vocab          = 128256
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_merges         = 280147
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd           = 4096
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_head           = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_head_kv        = 8
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_layer          = 32
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_rot            = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_gqa            = 4
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_ff             = 14336
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_expert         = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_expert_used    = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: causal attn      = 1
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: pooling type     = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope type        = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope scaling     = linear
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: freq_scale_train = 1
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_d_state      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model type       = 8B
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model ftype      = Q4_0
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model params     = 8.03 B
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 04 11:01:07 launchpad ollama[1659]: ggml_cuda_init: found 1 CUDA devices:
+Dec 04 11:01:07 launchpad ollama[1659]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 04 11:01:07 launchpad ollama[1659]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: .......................................................................................
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_ctx      = 2048
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_batch    = 512
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: n_ubatch   = 512
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: freq_scale = 1
+Dec 04 11:01:08 launchpad ollama[1659]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: graph nodes  = 1030
+Dec 04 11:01:08 launchpad ollama[1659]: llama_new_context_with_model: graph splits = 2
+Dec 04 11:01:08 launchpad ollama[348732]: INFO [main] model loaded | tid="140276627443712" timestamp=1733338868
+Dec 04 11:01:08 launchpad ollama[1659]: time=2024-12-04T11:01:08.475-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 04 11:01:16 launchpad ollama[1659]: [GIN] 2024/12/04 - 11:01:16 | 200 |  9.608117009s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 11:05:40 launchpad ollama[1659]: [GIN] 2024/12/04 - 11:05:40 | 200 |  6.492649587s |       127.0.0.1 | POST     "/api/chat"
+Dec 04 13:06:01 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 04 13:06:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 04 13:06:01 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 04 13:06:01 launchpad systemd[1]: ollama.service: Consumed 6min 51.007s CPU time, received 7.9M IP traffic, sent 8.6M IP traffic.
+-- Boot 8503f579953f4577b468cd97d1b57e4f --
+Dec 04 13:06:44 launchpad systemd[1]: Started Server for local large language models.
+Dec 04 13:06:44 launchpad ollama[1687]: 2024/12/04 13:06:44 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.815-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.820-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 04 13:06:44 launchpad ollama[1687]:  - using env:        export GIN_MODE=release
+Dec 04 13:06:44 launchpad ollama[1687]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.820-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 04 13:06:44 launchpad ollama[1687]: time=2024-12-04T13:06:44.821-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama2609597904/runners
+Dec 04 13:06:48 launchpad ollama[1687]: time=2024-12-04T13:06:48.842-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 04 13:06:48 launchpad ollama[1687]: time=2024-12-04T13:06:48.930-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |    1.224376ms |       127.0.0.1 | HEAD     "/"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |    2.013787ms |       127.0.0.1 | POST     "/api/show"
+Dec 05 08:04:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:21 | 200 |     247.916µs |       127.0.0.1 | POST     "/api/show"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.625-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.625-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.1 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43769"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.626-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] build info | build=0 commit="unknown" tid="140035587289088" timestamp=1733414661
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140035587289088" timestamp=1733414661 total_threads=16
+Dec 05 08:04:21 launchpad ollama[177289]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43769" tid="140035587289088" timestamp=1733414661
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:04:21 launchpad ollama[1687]: time=2024-12-05T08:04:21.877-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:04:21 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:04:21 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:04:21 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:04:26 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:04:27 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:04:27 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:04:27 launchpad ollama[177289]: INFO [main] model loaded | tid="140035587289088" timestamp=1733414667
+Dec 05 08:04:27 launchpad ollama[1687]: time=2024-12-05T08:04:27.394-08:00 level=INFO source=server.go:545 msg="llama runner started in 5.77 seconds"
+Dec 05 08:04:27 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:04:27 | 200 |  6.345604465s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:06:18 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:06:18 | 200 | 12.439577585s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:10:26 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:10:26 | 200 |  3.813542223s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36315"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.217-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.218-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.218-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] build info | build=0 commit="unknown" tid="140490451046400" timestamp=1733415674
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140490451046400" timestamp=1733415674 total_threads=16
+Dec 05 08:21:14 launchpad ollama[186406]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36315" tid="140490451046400" timestamp=1733415674
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:21:14 launchpad ollama[1687]: time=2024-12-05T08:21:14.469-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:21:14 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:21:14 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:21:14 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:21:15 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:21:15 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:21:15 launchpad ollama[186406]: INFO [main] model loaded | tid="140490451046400" timestamp=1733415675
+Dec 05 08:21:15 launchpad ollama[1687]: time=2024-12-05T08:21:15.222-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 08:21:27 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:21:27 | 200 | 14.264161715s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41411"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.025-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.026-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] build info | build=0 commit="unknown" tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139938871152640" timestamp=1733417962 total_threads=16
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41411" tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 08:59:22 launchpad ollama[1687]: time=2024-12-05T08:59:22.276-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 08:59:22 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 08:59:22 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: .......................................................................................
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 08:59:22 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 08:59:22 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 08:59:22 launchpad ollama[206729]: INFO [main] model loaded | tid="139938871152640" timestamp=1733417962
+Dec 05 08:59:23 launchpad ollama[1687]: time=2024-12-05T08:59:23.030-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 08:59:33 launchpad ollama[1687]: [GIN] 2024/12/05 - 08:59:33 | 200 | 12.304547927s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.563-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.564-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="10.4 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.564-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38489"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.565-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] build info | build=0 commit="unknown" tid="139747822436352" timestamp=1733419381
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139747822436352" timestamp=1733419381 total_threads=16
+Dec 05 09:23:01 launchpad ollama[219369]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38489" tid="139747822436352" timestamp=1733419381
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 09:23:01 launchpad ollama[1687]: time=2024-12-05T09:23:01.815-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 09:23:01 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 09:23:01 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 09:23:01 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: .......................................................................................
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 09:23:02 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 09:23:02 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 09:23:02 launchpad ollama[219369]: INFO [main] model loaded | tid="139747822436352" timestamp=1733419382
+Dec 05 09:23:02 launchpad ollama[1687]: time=2024-12-05T09:23:02.568-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 09:23:14 launchpad ollama[1687]: [GIN] 2024/12/05 - 09:23:14 | 200 | 13.574240459s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.488-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.5 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.488-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.5 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34215"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.489-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] build info | build=0 commit="unknown" tid="140586951700480" timestamp=1733448344
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140586951700480" timestamp=1733448344 total_threads=16
+Dec 05 17:25:44 launchpad ollama[1609159]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34215" tid="140586951700480" timestamp=1733448344
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:25:44 launchpad ollama[1687]: time=2024-12-05T17:25:44.740-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:25:44 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:25:44 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:25:44 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:25:45 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:25:45 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:25:45 launchpad ollama[1609159]: INFO [main] model loaded | tid="140586951700480" timestamp=1733448345
+Dec 05 17:25:45 launchpad ollama[1687]: time=2024-12-05T17:25:45.493-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.00 seconds"
+Dec 05 17:25:57 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:25:57 | 200 | 13.478277071s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:27:55 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:27:55 | 200 | 10.078664984s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.723-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.723-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46521"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.724-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] build info | build=0 commit="unknown" tid="140303987441664" timestamp=1733449031
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140303987441664" timestamp=1733449031 total_threads=16
+Dec 05 17:37:11 launchpad ollama[1615259]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46521" tid="140303987441664" timestamp=1733449031
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:37:11 launchpad ollama[1687]: time=2024-12-05T17:37:11.975-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:37:12 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:37:12 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:37:12 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:37:12 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:37:12 launchpad ollama[1615259]: INFO [main] model loaded | tid="140303987441664" timestamp=1733449032
+Dec 05 17:37:12 launchpad ollama[1687]: time=2024-12-05T17:37:12.978-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 05 17:37:21 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:37:21 | 200 | 10.112585412s |       127.0.0.1 | POST     "/api/chat"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.243-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.243-08:00 level=INFO source=memory.go:133 msg="offload to gpu" layers.requested=-1 layers.real=33 memory.available="8.7 GiB" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.weights.total="4.1 GiB" memory.weights.repeating="3.7 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:320 msg="starting llama server" cmd="/tmp/ollama2609597904/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 45891"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=sched.go:338 msg="loaded runners" count=1
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:504 msg="waiting for llama runner to start responding"
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.244-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server error"
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] build info | build=0 commit="unknown" tid="139890847899648" timestamp=1733450231
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] system info | n_threads=8 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139890847899648" timestamp=1733450231 total_threads=16
+Dec 05 17:57:11 launchpad ollama[1625869]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45891" tid="139890847899648" timestamp=1733450231
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type  f32:   65 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type q4_0:  225 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: llama_model_loader: - type q6_K:    1 tensors
+Dec 05 17:57:11 launchpad ollama[1687]: time=2024-12-05T17:57:11.495-08:00 level=INFO source=server.go:540 msg="waiting for server to become available" status="llm server loading model"
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_vocab: special tokens definition check successful ( 256/128256 ).
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: arch             = llama
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: vocab type       = BPE
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_vocab          = 128256
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_merges         = 280147
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd           = 4096
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_head           = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_head_kv        = 8
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_layer          = 32
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_rot            = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_gqa            = 4
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_ff             = 14336
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_expert         = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_expert_used    = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: causal attn      = 1
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: pooling type     = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope type        = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope scaling     = linear
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: freq_scale_train = 1
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: n_yarn_orig_ctx  = 8192
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_d_state      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model type       = 8B
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model ftype      = Q4_0
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model params     = 8.03 B
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   yes
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: CUDA_USE_TENSOR_CORES: no
+Dec 05 17:57:11 launchpad ollama[1687]: ggml_cuda_init: found 1 CUDA devices:
+Dec 05 17:57:11 launchpad ollama[1687]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: ggml ctx size =    0.30 MiB
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 05 17:57:11 launchpad ollama[1687]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: .......................................................................................
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_ctx      = 2048
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_batch    = 512
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: n_ubatch   = 512
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: freq_scale = 1
+Dec 05 17:57:12 launchpad ollama[1687]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: graph nodes  = 1030
+Dec 05 17:57:12 launchpad ollama[1687]: llama_new_context_with_model: graph splits = 2
+Dec 05 17:57:12 launchpad ollama[1625869]: INFO [main] model loaded | tid="139890847899648" timestamp=1733450232
+Dec 05 17:57:12 launchpad ollama[1687]: time=2024-12-05T17:57:12.498-08:00 level=INFO source=server.go:545 msg="llama runner started in 1.25 seconds"
+Dec 05 17:57:22 launchpad ollama[1687]: [GIN] 2024/12/05 - 17:57:22 | 200 | 11.885280432s |       127.0.0.1 | POST     "/api/chat"
+Dec 06 08:19:51 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 08:19:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 08:19:51 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 08:19:51 launchpad systemd[1]: ollama.service: Consumed 1min 56.861s CPU time, received 1.3M IP traffic, sent 1.8M IP traffic.
+-- Boot b06d70c38d9d47bab758fba931dbb670 --
+Dec 06 08:21:52 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 08:21:53 launchpad ollama[1676]: 2024/12/06 08:21:53 routes.go:1008: INFO server config env="map[OLLAMA_DEBUG:false OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:*] OLLAMA_RUNNERS_DIR: OLLAMA_TMPDIR:]"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.083-08:00 level=INFO source=images.go:704 msg="total blobs: 18"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.087-08:00 level=INFO source=images.go:711 msg="total unused blobs removed: 0"
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] [WARNING] Creating an Engine instance with the Logger and Recovery middleware already attached.
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] [WARNING] Running in "debug" mode. Switch to "release" mode in production.
+Dec 06 08:21:53 launchpad ollama[1676]:  - using env:        export GIN_MODE=release
+Dec 06 08:21:53 launchpad ollama[1676]:  - using code:        gin.SetMode(gin.ReleaseMode)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/pull                 --> github.com/ollama/ollama/server.(*Server).PullModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/generate             --> github.com/ollama/ollama/server.(*Server).GenerateHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/chat                 --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/embeddings           --> github.com/ollama/ollama/server.(*Server).EmbeddingsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/create               --> github.com/ollama/ollama/server.(*Server).CreateModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/push                 --> github.com/ollama/ollama/server.(*Server).PushModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/copy                 --> github.com/ollama/ollama/server.(*Server).CopyModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] DELETE /api/delete               --> github.com/ollama/ollama/server.(*Server).DeleteModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/show                 --> github.com/ollama/ollama/server.(*Server).ShowModelHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).CreateBlobHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/blobs/:digest        --> github.com/ollama/ollama/server.(*Server).HeadBlobHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/ps                   --> github.com/ollama/ollama/server.(*Server).ProcessHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] POST   /v1/chat/completions      --> github.com/ollama/ollama/server.(*Server).ChatHandler-fm (6 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] GET    /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /                         --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func1 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/tags                 --> github.com/ollama/ollama/server.(*Server).ListModelsHandler-fm (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: [GIN-debug] HEAD   /api/version              --> github.com/ollama/ollama/server.(*Server).GenerateRoutes.func2 (5 handlers)
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.088-08:00 level=INFO source=routes.go:1054 msg="Listening on 127.0.0.1:11434 (version 0.1.38)"
+Dec 06 08:21:53 launchpad ollama[1676]: time=2024-12-06T08:21:53.089-08:00 level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama1489032399/runners
+Dec 06 08:21:57 launchpad ollama[1676]: time=2024-12-06T08:21:57.009-08:00 level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 08:21:57 launchpad ollama[1676]: time=2024-12-06T08:21:57.095-08:00 level=INFO source=types.go:71 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda compute=8.6 driver=12.4 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.2 GiB"
+Dec 06 09:51:44 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 09:51:44 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 09:51:44 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 09:51:44 launchpad systemd[1]: ollama.service: Consumed 5.379s CPU time, no IP traffic.
+Dec 06 09:51:49 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 09:51:49 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 09:51:49 launchpad ollama[629744]: 2024/12/06 09:51:49 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 09:51:49 launchpad ollama[629744]: time=2024-12-06T09:51:49.729-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2737807184/runners
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.780-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.783-08:00 level=INFO source=gpu.go:568 msg="unable to load cuda driver library" library=/nix/store/6681919bklbwc4jadjrvq9an1ackx05g-nvidia-x11-560.35.03-6.6.30-rt30/lib/libcuda.so.560.35.03 error="cuda driver library init failure: 804"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.783-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.788-08:00 level=INFO source=gpu.go:347 msg="no compatible GPUs were discovered"
+Dec 06 09:51:52 launchpad ollama[629744]: time=2024-12-06T09:51:52.788-08:00 level=INFO source=types.go:107 msg="inference compute" id=0 library=cpu variant=avx2 compute="" driver=0.0 name="" total="62.6 GiB" available="57.5 GiB"
+Dec 06 10:05:01 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 10:05:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 10:05:01 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 10:05:01 launchpad systemd[1]: ollama.service: Consumed 3.363s CPU time, 541M memory peak, 508.1M written to disk.
+-- Boot edbe8cb00f2a4fa7a88446dbef5420da --
+Dec 06 10:06:26 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 10:06:26 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 10:06:26 launchpad ollama[1572]: 2024/12/06 10:06:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.415-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.421-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.423-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 10:06:26 launchpad ollama[1572]: time=2024-12-06T10:06:26.425-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3167090512/runners
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.422-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.422-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.423-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 10:06:29 launchpad ollama[1572]: time=2024-12-06T10:06:29.626-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 06 12:45:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 06 12:45:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 06 12:45:59 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 06 12:45:59 launchpad systemd[1]: ollama.service: Consumed 3.581s CPU time, 791.5M memory peak, 234.8M read from disk, 508.1M written to disk.
+-- Boot b9ef6a62028d4f349af64a0e5416b561 --
+Dec 06 12:46:32 launchpad systemd[1]: Starting Server for local large language models...
+Dec 06 12:46:32 launchpad systemd[1]: Started Server for local large language models.
+Dec 06 12:46:32 launchpad ollama[1573]: 2024/12/06 12:46:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.719-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.723-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.724-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 06 12:46:32 launchpad ollama[1573]: time=2024-12-06T12:46:32.727-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1560843028/runners
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.709-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.710-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 06 12:46:35 launchpad ollama[1573]: time=2024-12-06T12:46:35.924-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 06 19:00:22 launchpad systemd[1]: Stopping Server for local large language models...
+-- Boot 65b3437b4fd147ec925a7e6bebfc3127 --
+Dec 07 08:29:25 launchpad systemd[1]: Starting Server for local large language models...
+Dec 07 08:29:25 launchpad systemd[1]: Started Server for local large language models.
+Dec 07 08:29:25 launchpad ollama[1648]: 2024/12/07 08:29:25 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.305-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.309-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.311-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 07 08:29:25 launchpad ollama[1648]: time=2024-12-07T08:29:25.312-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama991934907/runners
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.288-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:29:28 launchpad ollama[1648]: time=2024-12-07T08:29:28.536-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Dec 07 08:30:35 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 07 08:30:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 07 08:30:35 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 07 08:30:35 launchpad systemd[1]: ollama.service: Consumed 3.483s CPU time, 790.7M memory peak, 234.7M read from disk, 508.1M written to disk.
+-- Boot ce51437370b3419eabbfd1159e7325bc --
+Dec 07 08:31:07 launchpad systemd[1]: Starting Server for local large language models...
+Dec 07 08:31:07 launchpad systemd[1]: Started Server for local large language models.
+Dec 07 08:31:07 launchpad ollama[1650]: 2024/12/07 08:31:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.843-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.847-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.848-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 07 08:31:07 launchpad ollama[1650]: time=2024-12-07T08:31:07.850-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1292331900/runners
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.840-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.841-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:10 launchpad ollama[1650]: time=2024-12-07T08:31:10.841-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 07 08:31:11 launchpad ollama[1650]: time=2024-12-07T08:31:11.049-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 07 16:12:35 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:35 | 200 |     621.541µs |       127.0.0.1 | HEAD     "/"
+Dec 07 16:12:35 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:35 | 200 |    6.484323ms |       127.0.0.1 | POST     "/api/show"
+Dec 07 16:12:35 launchpad ollama[1650]: time=2024-12-07T16:12:35.985-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10494410752 required="9.2 GiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.120-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38187"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.121-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.122-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] build info | build=0 commit="unknown" tid="139904028643328" timestamp=1733616756
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139904028643328" timestamp=1733616756 total_threads=16
+Dec 07 16:12:36 launchpad ollama[45616]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38187" tid="139904028643328" timestamp=1733616756
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 16:12:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 16:12:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 16:12:36 launchpad ollama[1650]: time=2024-12-07T16:12:36.373-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 16:12:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 16:12:43 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 16:12:44 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 16:12:44 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 16:12:44 launchpad ollama[45616]: INFO [main] model loaded | tid="139904028643328" timestamp=1733616764
+Dec 07 16:12:44 launchpad ollama[1650]: time=2024-12-07T16:12:44.901-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 07 16:12:44 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:12:44 | 200 |  8.919257897s |       127.0.0.1 | POST     "/api/generate"
+Dec 07 16:13:06 launchpad ollama[1650]: time=2024-12-07T16:13:06.296-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:13:09 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:13:09 | 200 |  2.879538725s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:14:20 launchpad ollama[1650]: time=2024-12-07T16:14:20.387-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:14:22 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:14:22 | 200 |  2.187351474s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:14:40 launchpad ollama[1650]: time=2024-12-07T16:14:40.007-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:14:43 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:14:43 | 200 |  3.754995679s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:15:53 launchpad ollama[1650]: time=2024-12-07T16:15:53.607-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:06 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:06 | 200 | 12.567518815s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:16:23 launchpad ollama[1650]: time=2024-12-07T16:16:23.127-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:27 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:27 | 200 |  4.511815046s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:16:41 launchpad ollama[1650]: time=2024-12-07T16:16:41.915-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:16:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:16:58 | 200 | 16.926235698s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:17:26 launchpad ollama[1650]: time=2024-12-07T16:17:26.849-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:17:29 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:17:29 | 200 |  2.208857201s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 16:17:39 launchpad ollama[1650]: time=2024-12-07T16:17:39.180-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 16:17:47 launchpad ollama[1650]: [GIN] 2024/12/07 - 16:17:47 | 200 |  7.896416154s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:07:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:58 | 200 |      14.163µs |       127.0.0.1 | HEAD     "/"
+Dec 07 18:07:58 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:58 | 200 |    3.429683ms |       127.0.0.1 | POST     "/api/show"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10334109696 required="9.2 GiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.5 GiB" free_swap="68.9 GiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.678-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 44783"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.679-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.680-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] build info | build=0 commit="unknown" tid="140077486399488" timestamp=1733623678
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140077486399488" timestamp=1733623678 total_threads=16
+Dec 07 18:07:58 launchpad ollama[50241]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44783" tid="140077486399488" timestamp=1733623678
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 18:07:58 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 18:07:58 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 18:07:58 launchpad ollama[1650]: time=2024-12-07T18:07:58.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 18:07:58 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 18:07:59 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 18:07:59 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 18:07:59 launchpad ollama[50241]: INFO [main] model loaded | tid="140077486399488" timestamp=1733623679
+Dec 07 18:07:59 launchpad ollama[1650]: time=2024-12-07T18:07:59.974-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 07 18:07:59 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:07:59 | 200 |  1.427623749s |       127.0.0.1 | POST     "/api/generate"
+Dec 07 18:09:24 launchpad ollama[1650]: time=2024-12-07T18:09:24.306-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:09:33 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:09:33 | 200 |  9.630707279s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:10:12 launchpad ollama[1650]: time=2024-12-07T18:10:12.324-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:10:19 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:10:19 | 200 |  7.226828194s |       127.0.0.1 | POST     "/api/chat"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.063-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.183-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10355539968 required="9.2 GiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.183-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.184-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.185-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45909"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.186-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] build info | build=0 commit="unknown" tid="140596541751296" timestamp=1733624120
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140596541751296" timestamp=1733624120 total_threads=16
+Dec 07 18:15:20 launchpad ollama[50643]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45909" tid="140596541751296" timestamp=1733624120
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 07 18:15:20 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 07 18:15:20 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 07 18:15:20 launchpad ollama[1650]: time=2024-12-07T18:15:20.476-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 07 18:15:20 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 07 18:15:21 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 07 18:15:21 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 07 18:15:21 launchpad ollama[50643]: INFO [main] model loaded | tid="140596541751296" timestamp=1733624121
+Dec 07 18:15:21 launchpad ollama[1650]: time=2024-12-07T18:15:21.479-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 07 18:15:32 launchpad ollama[1650]: [GIN] 2024/12/07 - 18:15:32 | 200 | 12.117083478s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.316-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.463-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.463-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.464-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 38879"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] build info | build=0 commit="unknown" tid="140455179128832" timestamp=1733676222
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140455179128832" timestamp=1733676222 total_threads=16
+Dec 08 08:43:42 launchpad ollama[57815]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38879" tid="140455179128832" timestamp=1733676222
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 08:43:42 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 08:43:42 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 08:43:42 launchpad ollama[1650]: time=2024-12-08T08:43:42.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 08:43:42 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 08:43:43 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 08:43:43 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 08:43:43 launchpad ollama[57815]: INFO [main] model loaded | tid="140455179128832" timestamp=1733676223
+Dec 08 08:43:43 launchpad ollama[1650]: time=2024-12-08T08:43:43.755-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 08:44:04 launchpad ollama[1650]: [GIN] 2024/12/08 - 08:44:04 | 200 | 21.852098012s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.683-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.832-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.833-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 45015"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 08:50:00 launchpad ollama[1650]: time=2024-12-08T08:50:00.834-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] build info | build=0 commit="unknown" tid="140066033774592" timestamp=1733676600
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140066033774592" timestamp=1733676600 total_threads=16
+Dec 08 08:50:00 launchpad ollama[61638]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45015" tid="140066033774592" timestamp=1733676600
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 08:50:00 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 08:50:00 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 08:50:00 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: time=2024-12-08T08:50:01.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 08:50:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 08:50:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 08:50:01 launchpad ollama[61638]: INFO [main] model loaded | tid="140066033774592" timestamp=1733676601
+Dec 08 08:50:02 launchpad ollama[1650]: time=2024-12-08T08:50:02.130-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 08:50:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 08:50:31 | 200 | 30.446080939s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.696-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.848-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.2 GiB" free_swap="68.9 GiB"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.848-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 42045"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.849-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 09:20:57 launchpad ollama[1650]: time=2024-12-08T09:20:57.850-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] build info | build=0 commit="unknown" tid="140673853370368" timestamp=1733678457
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140673853370368" timestamp=1733678457 total_threads=16
+Dec 08 09:20:57 launchpad ollama[66994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42045" tid="140673853370368" timestamp=1733678457
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 09:20:57 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 09:20:57 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 09:20:57 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: time=2024-12-08T09:20:58.139-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 09:20:58 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 09:20:58 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 09:20:58 launchpad ollama[66994]: INFO [main] model loaded | tid="140673853370368" timestamp=1733678458
+Dec 08 09:20:59 launchpad ollama[1650]: time=2024-12-08T09:20:59.143-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 09:21:09 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:21:09 | 200 | 12.067512562s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.170-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.317-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.318-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 35631"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.319-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] build info | build=0 commit="unknown" tid="139818168209408" timestamp=1733679123
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139818168209408" timestamp=1733679123 total_threads=16
+Dec 08 09:32:03 launchpad ollama[68579]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35631" tid="139818168209408" timestamp=1733679123
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 09:32:03 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 09:32:03 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 09:32:03 launchpad ollama[1650]: time=2024-12-08T09:32:03.609-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 09:32:03 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 09:32:04 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 09:32:04 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 09:32:04 launchpad ollama[68579]: INFO [main] model loaded | tid="139818168209408" timestamp=1733679124
+Dec 08 09:32:04 launchpad ollama[1650]: time=2024-12-08T09:32:04.613-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 09:32:13 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:32:13 | 200 |  9.844417818s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:36:13 launchpad ollama[1650]: time=2024-12-08T09:36:13.581-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:36:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:36:20 | 200 |  7.068664686s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:39:03 launchpad ollama[1650]: time=2024-12-08T09:39:03.802-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:39:16 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:39:16 | 200 | 12.658383022s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:40:45 launchpad ollama[1650]: time=2024-12-08T09:40:45.710-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:40:56 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:40:56 | 200 | 10.414743911s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 09:45:19 launchpad ollama[1650]: time=2024-12-08T09:45:19.928-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 09:45:34 launchpad ollama[1650]: [GIN] 2024/12/08 - 09:45:34 | 200 | 14.751119658s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.649-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.793-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.793-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 34377"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:13:47 launchpad ollama[1650]: time=2024-12-08T10:13:47.794-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] build info | build=0 commit="unknown" tid="140252596342784" timestamp=1733681627
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140252596342784" timestamp=1733681627 total_threads=16
+Dec 08 10:13:47 launchpad ollama[77261]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34377" tid="140252596342784" timestamp=1733681627
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:13:47 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:13:47 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:13:47 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: time=2024-12-08T10:13:48.088-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:13:48 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:13:48 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:13:48 launchpad ollama[77261]: INFO [main] model loaded | tid="140252596342784" timestamp=1733681628
+Dec 08 10:13:49 launchpad ollama[1650]: time=2024-12-08T10:13:49.092-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 10:14:06 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:14:06 | 200 | 18.756867855s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.234-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.384-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.384-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 43061"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.385-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] build info | build=0 commit="unknown" tid="140602753769472" timestamp=1733682000
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140602753769472" timestamp=1733682000 total_threads=16
+Dec 08 10:20:00 launchpad ollama[80016]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43061" tid="140602753769472" timestamp=1733682000
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:20:00 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:20:00 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:20:00 launchpad ollama[1650]: time=2024-12-08T10:20:00.674-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:20:00 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:20:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:20:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:20:01 launchpad ollama[80016]: INFO [main] model loaded | tid="140602753769472" timestamp=1733682001
+Dec 08 10:20:01 launchpad ollama[1650]: time=2024-12-08T10:20:01.678-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 10:20:18 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:20:18 | 200 | 18.129676296s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.598-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.743-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.1 GiB" free_swap="68.9 GiB"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.743-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 43375"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.744-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 10:26:55 launchpad ollama[1650]: time=2024-12-08T10:26:55.745-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] build info | build=0 commit="unknown" tid="140292201209856" timestamp=1733682415
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140292201209856" timestamp=1733682415 total_threads=16
+Dec 08 10:26:55 launchpad ollama[82681]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43375" tid="140292201209856" timestamp=1733682415
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 10:26:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 10:26:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 10:26:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: time=2024-12-08T10:26:56.033-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 10:26:56 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 10:26:56 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 10:26:56 launchpad ollama[82681]: INFO [main] model loaded | tid="140292201209856" timestamp=1733682416
+Dec 08 10:26:57 launchpad ollama[1650]: time=2024-12-08T10:26:57.036-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 10:27:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:27:20 | 200 | 24.559503085s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:30:59 launchpad ollama[1650]: time=2024-12-08T10:30:59.211-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:31:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:31:07 | 200 |  8.074165886s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 10:34:21 launchpad ollama[1650]: time=2024-12-08T10:34:21.741-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 10:34:39 launchpad ollama[1650]: [GIN] 2024/12/08 - 10:34:39 | 200 | 17.802571239s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.456-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.604-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.9 GiB" free_swap="68.9 GiB"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.605-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33281"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.606-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] build info | build=0 commit="unknown" tid="140551050502144" timestamp=1733687154
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140551050502144" timestamp=1733687154 total_threads=16
+Dec 08 11:45:54 launchpad ollama[91373]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33281" tid="140551050502144" timestamp=1733687154
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 11:45:54 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 11:45:54 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 11:45:54 launchpad ollama[1650]: time=2024-12-08T11:45:54.898-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 11:45:54 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 11:45:55 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 11:45:55 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 11:45:55 launchpad ollama[91373]: INFO [main] model loaded | tid="140551050502144" timestamp=1733687155
+Dec 08 11:45:55 launchpad ollama[1650]: time=2024-12-08T11:45:55.902-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 11:46:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 11:46:07 | 200 |   13.2517165s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 11:47:18 launchpad ollama[1650]: time=2024-12-08T11:47:18.789-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 11:47:28 launchpad ollama[1650]: [GIN] 2024/12/08 - 11:47:28 | 200 |  9.932454336s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.157-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.9 GiB" free_swap="68.9 GiB"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 34157"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] build info | build=0 commit="unknown" tid="139958489665536" timestamp=1733688041
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139958489665536" timestamp=1733688041 total_threads=16
+Dec 08 12:00:41 launchpad ollama[94610]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34157" tid="139958489665536" timestamp=1733688041
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:00:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:00:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:00:41 launchpad ollama[1650]: time=2024-12-08T12:00:41.595-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:00:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:00:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:00:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:00:42 launchpad ollama[94610]: INFO [main] model loaded | tid="139958489665536" timestamp=1733688042
+Dec 08 12:00:42 launchpad ollama[1650]: time=2024-12-08T12:00:42.599-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:00:55 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:00:55 | 200 | 14.138865344s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:02:53 launchpad ollama[1650]: time=2024-12-08T12:02:53.349-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:03:02 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:03:02 | 200 |  9.336662279s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:28:07 launchpad ollama[1650]: time=2024-12-08T12:28:07.995-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.146-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.146-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.147-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33187"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.148-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] build info | build=0 commit="unknown" tid="140674044354560" timestamp=1733689688
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140674044354560" timestamp=1733689688 total_threads=16
+Dec 08 12:28:08 launchpad ollama[97750]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33187" tid="140674044354560" timestamp=1733689688
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:28:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:28:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:28:08 launchpad ollama[1650]: time=2024-12-08T12:28:08.438-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:28:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:28:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:28:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:28:09 launchpad ollama[97750]: INFO [main] model loaded | tid="140674044354560" timestamp=1733689689
+Dec 08 12:28:09 launchpad ollama[1650]: time=2024-12-08T12:28:09.442-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:28:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:28:31 | 200 | 23.477297113s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:29:42 launchpad ollama[1650]: time=2024-12-08T12:29:42.948-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:29:55 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:29:55 | 200 | 12.239291089s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:33:39 launchpad ollama[1650]: time=2024-12-08T12:33:39.317-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:33:52 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:33:52 | 200 | 13.105667696s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.021-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.176-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.176-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40743"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.177-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] build info | build=0 commit="unknown" tid="139692666245120" timestamp=1733690766
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139692666245120" timestamp=1733690766 total_threads=16
+Dec 08 12:46:06 launchpad ollama[105224]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40743" tid="139692666245120" timestamp=1733690766
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:46:06 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:46:06 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:46:06 launchpad ollama[1650]: time=2024-12-08T12:46:06.468-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:46:06 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:46:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:46:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:46:07 launchpad ollama[105224]: INFO [main] model loaded | tid="139692666245120" timestamp=1733690767
+Dec 08 12:46:07 launchpad ollama[1650]: time=2024-12-08T12:46:07.471-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 12:46:20 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:46:20 | 200 | 14.605562135s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.148-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.296-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.296-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 41469"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.297-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] build info | build=0 commit="unknown" tid="139777021140992" timestamp=1733691493
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139777021140992" timestamp=1733691493 total_threads=16
+Dec 08 12:58:13 launchpad ollama[107236]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41469" tid="139777021140992" timestamp=1733691493
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 12:58:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 12:58:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 12:58:13 launchpad ollama[1650]: time=2024-12-08T12:58:13.589-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 12:58:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 12:58:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 12:58:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 12:58:14 launchpad ollama[107236]: INFO [main] model loaded | tid="139777021140992" timestamp=1733691494
+Dec 08 12:58:14 launchpad ollama[1650]: time=2024-12-08T12:58:14.593-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 12:58:25 launchpad ollama[1650]: [GIN] 2024/12/08 - 12:58:25 | 200 | 12.497340701s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.871-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.991-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.992-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 36551"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:03:26 launchpad ollama[1650]: time=2024-12-08T13:03:26.993-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] build info | build=0 commit="unknown" tid="139995819257856" timestamp=1733691807
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139995819257856" timestamp=1733691807 total_threads=16
+Dec 08 13:03:27 launchpad ollama[108920]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36551" tid="139995819257856" timestamp=1733691807
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:03:27 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:03:27 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: time=2024-12-08T13:03:27.283-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:03:27 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:03:27 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:03:28 launchpad ollama[108920]: INFO [main] model loaded | tid="139995819257856" timestamp=1733691808
+Dec 08 13:03:28 launchpad ollama[1650]: time=2024-12-08T13:03:28.287-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:03:43 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:03:43 | 200 | 16.335701792s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.709-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.859-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.860-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 45133"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:35:58 launchpad ollama[1650]: time=2024-12-08T13:35:58.861-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] build info | build=0 commit="unknown" tid="139995389186048" timestamp=1733693758
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139995389186048" timestamp=1733693758 total_threads=16
+Dec 08 13:35:58 launchpad ollama[111252]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45133" tid="139995389186048" timestamp=1733693758
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:35:58 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:35:58 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:35:58 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: time=2024-12-08T13:35:59.149-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:35:59 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:35:59 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:35:59 launchpad ollama[111252]: INFO [main] model loaded | tid="139995389186048" timestamp=1733693759
+Dec 08 13:36:00 launchpad ollama[1650]: time=2024-12-08T13:36:00.152-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:36:13 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:36:13 | 200 | 14.457473019s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:37:03 launchpad ollama[1650]: time=2024-12-08T13:37:03.557-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:37:07 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:37:07 | 200 |  4.419889077s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.501-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.650-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.650-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 33725"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.651-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] build info | build=0 commit="unknown" tid="139963495624704" timestamp=1733694857
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139963495624704" timestamp=1733694857 total_threads=16
+Dec 08 13:54:17 launchpad ollama[113549]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33725" tid="139963495624704" timestamp=1733694857
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 13:54:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 13:54:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 13:54:17 launchpad ollama[1650]: time=2024-12-08T13:54:17.940-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 13:54:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 13:54:18 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 13:54:18 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 13:54:18 launchpad ollama[113549]: INFO [main] model loaded | tid="139963495624704" timestamp=1733694858
+Dec 08 13:54:18 launchpad ollama[1650]: time=2024-12-08T13:54:18.944-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 13:54:31 launchpad ollama[1650]: [GIN] 2024/12/08 - 13:54:31 | 200 | 13.913799814s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.824-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.971-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.971-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 35817"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.972-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:03:33 launchpad ollama[1650]: time=2024-12-08T14:03:33.973-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] build info | build=0 commit="unknown" tid="140507858587648" timestamp=1733695413
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140507858587648" timestamp=1733695413 total_threads=16
+Dec 08 14:03:33 launchpad ollama[115393]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35817" tid="140507858587648" timestamp=1733695413
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:03:33 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:03:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:03:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: time=2024-12-08T14:03:34.265-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:03:34 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:03:34 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:03:35 launchpad ollama[115393]: INFO [main] model loaded | tid="140507858587648" timestamp=1733695415
+Dec 08 14:03:35 launchpad ollama[1650]: time=2024-12-08T14:03:35.269-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 14:03:42 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:03:42 | 200 |  8.825813962s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:12:53 launchpad ollama[1650]: time=2024-12-08T14:12:53.929-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.079-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.079-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 38301"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.080-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] build info | build=0 commit="unknown" tid="140157593313280" timestamp=1733695974
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140157593313280" timestamp=1733695974 total_threads=16
+Dec 08 14:12:54 launchpad ollama[116242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38301" tid="140157593313280" timestamp=1733695974
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:12:54 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:12:54 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:12:54 launchpad ollama[1650]: time=2024-12-08T14:12:54.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:12:54 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:12:55 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:12:55 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:12:55 launchpad ollama[116242]: INFO [main] model loaded | tid="140157593313280" timestamp=1733695975
+Dec 08 14:12:55 launchpad ollama[1650]: time=2024-12-08T14:12:55.376-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 08 14:13:21 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:13:21 | 200 | 27.641394282s |       127.0.0.1 | POST     "/api/chat"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.306-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.463-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.463-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40551"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.464-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] build info | build=0 commit="unknown" tid="140232535138304" timestamp=1733696845
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140232535138304" timestamp=1733696845 total_threads=16
+Dec 08 14:27:25 launchpad ollama[120773]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40551" tid="140232535138304" timestamp=1733696845
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 08 14:27:25 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 08 14:27:25 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 08 14:27:25 launchpad ollama[1650]: time=2024-12-08T14:27:25.755-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors: offloaded 40/41 layers to GPU
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 08 14:27:25 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 08 14:27:26 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 08 14:27:26 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 4
+Dec 08 14:27:26 launchpad ollama[120773]: INFO [main] model loaded | tid="140232535138304" timestamp=1733696846
+Dec 08 14:27:26 launchpad ollama[1650]: time=2024-12-08T14:27:26.758-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 08 14:27:43 launchpad ollama[1650]: [GIN] 2024/12/08 - 14:27:43 | 200 | 18.686273917s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.185-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.329-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.329-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.330-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44955"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.331-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] build info | build=0 commit="unknown" tid="140070391537664" timestamp=1733786970
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140070391537664" timestamp=1733786970 total_threads=16
+Dec 09 15:29:30 launchpad ollama[150766]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44955" tid="140070391537664" timestamp=1733786970
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 09 15:29:30 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 09 15:29:30 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 09 15:29:30 launchpad ollama[1650]: time=2024-12-09T15:29:30.619-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 09 15:29:30 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 09 15:29:31 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 09 15:29:31 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 09 15:29:31 launchpad ollama[150766]: INFO [main] model loaded | tid="140070391537664" timestamp=1733786971
+Dec 09 15:29:31 launchpad ollama[1650]: time=2024-12-09T15:29:31.623-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 09 15:30:03 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:30:03 | 200 | 33.295449291s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:30:07 launchpad ollama[1650]: time=2024-12-09T15:30:07.828-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:30:26 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:30:26 | 200 | 18.859048206s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:30:54 launchpad ollama[1650]: time=2024-12-09T15:30:54.112-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:31:05 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:31:05 | 200 | 11.163934187s |       127.0.0.1 | POST     "/api/chat"
+Dec 09 15:31:29 launchpad ollama[1650]: time=2024-12-09T15:31:29.056-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 09 15:31:37 launchpad ollama[1650]: [GIN] 2024/12/09 - 15:31:37 | 200 |  8.885529008s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.233-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.379-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.379-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35339"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.380-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] build info | build=0 commit="unknown" tid="140442120286208" timestamp=1733850034
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140442120286208" timestamp=1733850034 total_threads=16
+Dec 10 09:00:34 launchpad ollama[174677]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35339" tid="140442120286208" timestamp=1733850034
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 09:00:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 09:00:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 09:00:34 launchpad ollama[1650]: time=2024-12-10T09:00:34.674-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 09:00:34 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 09:00:35 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 09:00:35 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 09:00:35 launchpad ollama[174677]: INFO [main] model loaded | tid="140442120286208" timestamp=1733850035
+Dec 10 09:00:35 launchpad ollama[1650]: time=2024-12-10T09:00:35.679-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 09:00:49 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:00:49 | 200 | 15.598580998s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:05:28 launchpad ollama[1650]: time=2024-12-10T09:05:28.730-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:05:50 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:05:50 | 200 | 21.631372507s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:06:57 launchpad ollama[1650]: time=2024-12-10T09:06:57.695-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:07:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:07:18 | 200 | 20.776754378s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 09:10:40 launchpad ollama[1650]: time=2024-12-10T09:10:40.942-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 09:11:30 launchpad ollama[1650]: [GIN] 2024/12/10 - 09:11:30 | 200 | 50.042077717s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.715-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.866-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.866-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36685"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:00:18 launchpad ollama[1650]: time=2024-12-10T10:00:18.867-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] build info | build=0 commit="unknown" tid="140199024840704" timestamp=1733853618
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140199024840704" timestamp=1733853618 total_threads=16
+Dec 10 10:00:18 launchpad ollama[204106]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36685" tid="140199024840704" timestamp=1733853618
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:00:18 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:00:18 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:00:18 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: time=2024-12-10T10:00:19.159-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:00:19 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:00:19 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:00:19 launchpad ollama[204106]: INFO [main] model loaded | tid="140199024840704" timestamp=1733853619
+Dec 10 10:00:20 launchpad ollama[1650]: time=2024-12-10T10:00:20.163-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:00:41 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:00:41 | 200 |  23.17640591s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:02:48 launchpad ollama[1650]: time=2024-12-10T10:02:48.458-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:03:04 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:03:04 | 200 |  15.57340047s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:03:26 launchpad ollama[1650]: time=2024-12-10T10:03:26.485-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:03:44 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:03:44 | 200 | 18.405971791s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:04:51 launchpad ollama[1650]: time=2024-12-10T10:04:51.604-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:05:09 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:05:09 | 200 | 17.662406949s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:12:40 launchpad ollama[1650]: time=2024-12-10T10:12:40.988-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.139-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.139-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42635"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.140-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.141-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] build info | build=0 commit="unknown" tid="140154409959424" timestamp=1733854361
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140154409959424" timestamp=1733854361 total_threads=16
+Dec 10 10:12:41 launchpad ollama[221878]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42635" tid="140154409959424" timestamp=1733854361
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:12:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:12:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:12:41 launchpad ollama[1650]: time=2024-12-10T10:12:41.433-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:12:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:12:42 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:12:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:12:42 launchpad ollama[221878]: INFO [main] model loaded | tid="140154409959424" timestamp=1733854362
+Dec 10 10:12:42 launchpad ollama[1650]: time=2024-12-10T10:12:42.437-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:12:49 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:12:49 | 200 |  8.234829757s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:15:39 launchpad ollama[1650]: time=2024-12-10T10:15:39.237-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:15:54 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:15:54 | 200 | 15.748583648s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.120-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.275-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.275-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.276-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39317"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.277-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] build info | build=0 commit="unknown" tid="139865239724032" timestamp=1733855036
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139865239724032" timestamp=1733855036 total_threads=16
+Dec 10 10:23:56 launchpad ollama[227340]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39317" tid="139865239724032" timestamp=1733855036
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:23:56 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:23:56 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:23:56 launchpad ollama[1650]: time=2024-12-10T10:23:56.569-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:23:56 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:23:57 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:23:57 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:23:57 launchpad ollama[227340]: INFO [main] model loaded | tid="139865239724032" timestamp=1733855037
+Dec 10 10:23:57 launchpad ollama[1650]: time=2024-12-10T10:23:57.573-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 10:24:12 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:24:12 | 200 | 16.631156179s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:27:59 launchpad ollama[1650]: time=2024-12-10T10:27:59.553-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:28:12 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:28:12 | 200 | 12.642030634s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:30:13 launchpad ollama[1650]: time=2024-12-10T10:30:13.028-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:30:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:30:18 | 200 |  5.024615826s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:31:48 launchpad ollama[1650]: time=2024-12-10T10:31:48.856-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:31:53 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:31:53 | 200 |  5.079846493s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.455-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.602-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.602-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42969"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.603-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] build info | build=0 commit="unknown" tid="140238677946368" timestamp=1733857003
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140238677946368" timestamp=1733857003 total_threads=16
+Dec 10 10:56:43 launchpad ollama[235209]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42969" tid="140238677946368" timestamp=1733857003
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 10:56:43 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 10:56:43 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 10:56:43 launchpad ollama[1650]: time=2024-12-10T10:56:43.891-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 10:56:43 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 10:56:44 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 10:56:44 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 10:56:44 launchpad ollama[235209]: INFO [main] model loaded | tid="140238677946368" timestamp=1733857004
+Dec 10 10:56:44 launchpad ollama[1650]: time=2024-12-10T10:56:44.895-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 10:57:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 10:57:14 | 200 | 31.280194103s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:01:27 launchpad ollama[1650]: time=2024-12-10T11:01:27.257-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:03:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:03:13 | 200 |         1m45s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.712-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.855-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.856-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39523"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 11:17:34 launchpad ollama[1650]: time=2024-12-10T11:17:34.857-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] build info | build=0 commit="unknown" tid="140045018062848" timestamp=1733858254
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140045018062848" timestamp=1733858254 total_threads=16
+Dec 10 11:17:34 launchpad ollama[272575]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39523" tid="140045018062848" timestamp=1733858254
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 11:17:34 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 11:17:34 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 11:17:34 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: time=2024-12-10T11:17:35.160-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 11:17:35 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 11:17:35 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 11:17:35 launchpad ollama[272575]: INFO [main] model loaded | tid="140045018062848" timestamp=1733858255
+Dec 10 11:17:36 launchpad ollama[1650]: time=2024-12-10T11:17:36.164-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 10 11:17:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:17:42 | 200 |  7.613038765s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.637-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.790-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.791-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39073"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 11:28:04 launchpad ollama[1650]: time=2024-12-10T11:28:04.792-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] build info | build=0 commit="unknown" tid="140715010023424" timestamp=1733858884
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140715010023424" timestamp=1733858884 total_threads=16
+Dec 10 11:28:04 launchpad ollama[274621]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39073" tid="140715010023424" timestamp=1733858884
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 11:28:04 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 11:28:04 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 11:28:04 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: time=2024-12-10T11:28:05.086-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 11:28:05 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 11:28:05 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 11:28:05 launchpad ollama[274621]: INFO [main] model loaded | tid="140715010023424" timestamp=1733858885
+Dec 10 11:28:06 launchpad ollama[1650]: time=2024-12-10T11:28:06.090-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 11:28:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 11:28:14 | 200 |  9.443002105s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.080-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.224-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43849"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.225-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.226-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] build info | build=0 commit="unknown" tid="140425342947328" timestamp=1733863691
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140425342947328" timestamp=1733863691 total_threads=16
+Dec 10 12:48:11 launchpad ollama[277815]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43849" tid="140425342947328" timestamp=1733863691
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 12:48:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 12:48:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 12:48:11 launchpad ollama[1650]: time=2024-12-10T12:48:11.515-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 12:48:11 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 12:48:12 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 12:48:12 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 12:48:12 launchpad ollama[277815]: INFO [main] model loaded | tid="140425342947328" timestamp=1733863692
+Dec 10 12:48:12 launchpad ollama[1650]: time=2024-12-10T12:48:12.520-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 12:48:19 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:48:19 | 200 |  8.712496404s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:50:22 launchpad ollama[1650]: time=2024-12-10T12:50:22.507-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:50:29 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:50:29 | 200 |  7.095418596s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:51:39 launchpad ollama[1650]: time=2024-12-10T12:51:39.212-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:51:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:51:55 | 200 | 16.042730667s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:53:02 launchpad ollama[1650]: time=2024-12-10T12:53:02.035-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:53:11 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:53:11 | 200 |  9.333209071s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 12:56:23 launchpad ollama[1650]: time=2024-12-10T12:56:23.844-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 12:56:59 launchpad ollama[1650]: [GIN] 2024/12/10 - 12:56:59 | 200 | 35.407892399s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.300-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.454-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.454-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34605"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.455-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] build info | build=0 commit="unknown" tid="140344654024704" timestamp=1733865528
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140344654024704" timestamp=1733865528 total_threads=16
+Dec 10 13:18:48 launchpad ollama[299175]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34605" tid="140344654024704" timestamp=1733865528
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 13:18:48 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 13:18:48 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 13:18:48 launchpad ollama[1650]: time=2024-12-10T13:18:48.745-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 13:18:48 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 13:18:49 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 13:18:49 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 13:18:49 launchpad ollama[299175]: INFO [main] model loaded | tid="140344654024704" timestamp=1733865529
+Dec 10 13:18:49 launchpad ollama[1650]: time=2024-12-10T13:18:49.749-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 13:19:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:19:13 | 200 | 25.362247142s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:20:55 launchpad ollama[1650]: time=2024-12-10T13:20:55.361-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:21:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:21:05 | 200 | 10.381297034s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:22:27 launchpad ollama[1650]: time=2024-12-10T13:22:27.545-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:23:02 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:23:02 | 200 | 34.594122762s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:27:11 launchpad ollama[1650]: time=2024-12-10T13:27:11.640-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:27:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:27:28 | 200 | 16.383890466s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:27:57 launchpad ollama[1650]: time=2024-12-10T13:27:57.453-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:28:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:28:05 | 200 |  8.166409833s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:28:27 launchpad ollama[1650]: time=2024-12-10T13:28:27.718-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:28:59 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:28:59 | 200 | 31.323937518s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:29:17 launchpad ollama[1650]: time=2024-12-10T13:29:17.217-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:29:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:29:28 | 200 |  11.66648602s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:30:25 launchpad ollama[1650]: time=2024-12-10T13:30:25.910-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:30:30 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:30:30 | 200 |  4.839795998s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.604-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.757-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.758-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33245"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 13:36:36 launchpad ollama[1650]: time=2024-12-10T13:36:36.759-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] build info | build=0 commit="unknown" tid="140223374622720" timestamp=1733866596
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140223374622720" timestamp=1733866596 total_threads=16
+Dec 10 13:36:36 launchpad ollama[332423]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33245" tid="140223374622720" timestamp=1733866596
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 13:36:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 13:36:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 13:36:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: time=2024-12-10T13:36:37.050-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 13:36:37 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 13:36:37 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 13:36:37 launchpad ollama[332423]: INFO [main] model loaded | tid="140223374622720" timestamp=1733866597
+Dec 10 13:36:38 launchpad ollama[1650]: time=2024-12-10T13:36:38.054-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 13:36:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:36:42 | 200 |   5.49179454s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:37:10 launchpad ollama[1650]: time=2024-12-10T13:37:10.150-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:37:15 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:37:15 | 200 |  5.598442864s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:37:57 launchpad ollama[1650]: time=2024-12-10T13:37:57.357-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:38:05 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:38:05 | 200 |  8.616287505s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:38:47 launchpad ollama[1650]: time=2024-12-10T13:38:47.329-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:38:53 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:38:53 | 200 |  6.271213617s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:42:40 launchpad ollama[1650]: time=2024-12-10T13:42:40.071-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:42:58 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:42:58 | 200 | 18.673486435s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:43:24 launchpad ollama[1650]: time=2024-12-10T13:43:24.174-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:43:32 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:43:32 | 200 |  8.166034494s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:47:54 launchpad ollama[1650]: time=2024-12-10T13:47:54.090-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:48:36 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:48:36 | 200 | 42.472811818s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 13:49:17 launchpad ollama[1650]: time=2024-12-10T13:49:17.497-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 13:49:48 launchpad ollama[1650]: [GIN] 2024/12/10 - 13:49:48 | 200 | 30.754339304s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.774-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.929-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.930-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36223"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:01:25 launchpad ollama[1650]: time=2024-12-10T14:01:25.931-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] build info | build=0 commit="unknown" tid="140392264495104" timestamp=1733868085
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140392264495104" timestamp=1733868085 total_threads=16
+Dec 10 14:01:25 launchpad ollama[362365]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36223" tid="140392264495104" timestamp=1733868085
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:01:25 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:01:25 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:01:25 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: time=2024-12-10T14:01:26.221-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:01:26 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:01:26 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:01:27 launchpad ollama[362365]: INFO [main] model loaded | tid="140392264495104" timestamp=1733868086
+Dec 10 14:01:27 launchpad ollama[1650]: time=2024-12-10T14:01:27.225-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 10 14:01:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:01:42 | 200 | 16.917501817s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.044-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.200-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.201-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33171"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.202-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] build info | build=0 commit="unknown" tid="140321980641280" timestamp=1733868499
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140321980641280" timestamp=1733868499 total_threads=16
+Dec 10 14:08:19 launchpad ollama[365838]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33171" tid="140321980641280" timestamp=1733868499
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:08:19 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:08:19 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:08:19 launchpad ollama[1650]: time=2024-12-10T14:08:19.497-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:08:19 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:08:20 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:08:20 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:08:20 launchpad ollama[365838]: INFO [main] model loaded | tid="140321980641280" timestamp=1733868500
+Dec 10 14:08:20 launchpad ollama[1650]: time=2024-12-10T14:08:20.501-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:08:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:08:42 | 200 |  23.18110582s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:11:03 launchpad ollama[1650]: time=2024-12-10T14:11:03.977-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:11:17 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:11:17 | 200 | 13.750877455s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:12:44 launchpad ollama[1650]: time=2024-12-10T14:12:44.128-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:12:52 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:12:52 | 200 |  8.702631088s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.143-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.291-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.291-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.292-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42585"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.293-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] build info | build=0 commit="unknown" tid="139899278577664" timestamp=1733869421
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139899278577664" timestamp=1733869421 total_threads=16
+Dec 10 14:23:41 launchpad ollama[376789]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42585" tid="139899278577664" timestamp=1733869421
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:23:41 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:23:41 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:23:41 launchpad ollama[1650]: time=2024-12-10T14:23:41.591-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:23:41 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:23:42 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:23:42 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:23:42 launchpad ollama[376789]: INFO [main] model loaded | tid="139899278577664" timestamp=1733869422
+Dec 10 14:23:42 launchpad ollama[1650]: time=2024-12-10T14:23:42.595-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:24:03 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:24:03 | 200 | 22.165994735s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:33:56 launchpad ollama[1650]: time=2024-12-10T14:33:56.981-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.136-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.136-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 40473"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.138-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] build info | build=0 commit="unknown" tid="140049001594880" timestamp=1733870037
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140049001594880" timestamp=1733870037 total_threads=16
+Dec 10 14:33:57 launchpad ollama[381767]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40473" tid="140049001594880" timestamp=1733870037
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:33:57 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:33:57 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:33:57 launchpad ollama[1650]: time=2024-12-10T14:33:57.429-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:33:57 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:33:58 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:33:58 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:33:58 launchpad ollama[381767]: INFO [main] model loaded | tid="140049001594880" timestamp=1733870038
+Dec 10 14:33:58 launchpad ollama[1650]: time=2024-12-10T14:33:58.434-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:34:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:34:18 | 200 | 21.837078215s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:38:39 launchpad ollama[1650]: time=2024-12-10T14:38:39.815-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:38:56 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:38:56 | 200 | 16.692629007s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.450-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.607-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.608-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33501"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.609-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.610-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] build info | build=0 commit="unknown" tid="140705561300992" timestamp=1733870852
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140705561300992" timestamp=1733870852 total_threads=16
+Dec 10 14:47:32 launchpad ollama[390600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33501" tid="140705561300992" timestamp=1733870852
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 14:47:32 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 14:47:32 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 14:47:32 launchpad ollama[1650]: time=2024-12-10T14:47:32.906-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 14:47:32 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 14:47:33 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 14:47:33 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 14:47:33 launchpad ollama[390600]: INFO [main] model loaded | tid="140705561300992" timestamp=1733870853
+Dec 10 14:47:33 launchpad ollama[1650]: time=2024-12-10T14:47:33.909-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 14:47:45 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:47:45 | 200 | 13.391795289s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:50:18 launchpad ollama[1650]: time=2024-12-10T14:50:18.729-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:50:43 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:50:43 | 200 | 24.817309058s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:51:57 launchpad ollama[1650]: time=2024-12-10T14:51:57.396-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:52:13 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:52:13 | 200 | 16.206639651s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:54:44 launchpad ollama[1650]: time=2024-12-10T14:54:44.484-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:55:02 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:55:02 | 200 | 17.705412297s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:55:40 launchpad ollama[1650]: time=2024-12-10T14:55:40.893-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:56:07 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:56:07 | 200 | 26.230730716s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 14:57:00 launchpad ollama[1650]: time=2024-12-10T14:57:00.005-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 14:57:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 14:57:18 | 200 | 18.065191785s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.479-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.630-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.631-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34683"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.632-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] build info | build=0 commit="unknown" tid="140274761580544" timestamp=1733872573
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140274761580544" timestamp=1733872573 total_threads=16
+Dec 10 15:16:13 launchpad ollama[417545]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34683" tid="140274761580544" timestamp=1733872573
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 15:16:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 15:16:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 10 15:16:13 launchpad ollama[1650]: time=2024-12-10T15:16:13.925-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 10 15:16:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 15:16:14 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 10 15:16:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 10 15:16:14 launchpad ollama[417545]: INFO [main] model loaded | tid="140274761580544" timestamp=1733872574
+Dec 10 15:16:14 launchpad ollama[1650]: time=2024-12-10T15:16:14.928-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 10 15:16:28 launchpad ollama[1650]: [GIN] 2024/12/10 - 15:16:28 | 200 | 14.648835449s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 16:57:39 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:39 | 404 |     128.842µs |       127.0.0.1 | POST     "/api/generate"
+Dec 10 16:57:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:55 | 200 |      15.651µs |       127.0.0.1 | HEAD     "/"
+Dec 10 16:57:55 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:57:55 | 200 |   18.644249ms |       127.0.0.1 | POST     "/api/show"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9508159488 required="6.2 GiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.464-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33643"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.465-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.466-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] build info | build=0 commit="unknown" tid="140253495021568" timestamp=1733878675
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140253495021568" timestamp=1733878675 total_threads=16
+Dec 10 16:57:55 launchpad ollama[421866]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33643" tid="140253495021568" timestamp=1733878675
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 16:57:55 launchpad ollama[1650]: time=2024-12-10T16:57:55.717-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 16:57:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 16:57:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 16:57:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 16:58:00 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 16:58:01 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 16:58:01 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 16:58:01 launchpad ollama[421866]: INFO [main] model loaded | tid="140253495021568" timestamp=1733878681
+Dec 10 16:58:01 launchpad ollama[1650]: time=2024-12-10T16:58:01.488-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Dec 10 16:58:01 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:01 | 200 |  6.198144851s |       127.0.0.1 | POST     "/api/generate"
+Dec 10 16:58:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:14 | 200 |  786.154967ms |       127.0.0.1 | POST     "/api/chat"
+Dec 10 16:58:47 launchpad ollama[1650]: [GIN] 2024/12/10 - 16:58:47 | 200 |  1.297004736s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:00:38 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:00:38 | 200 |  3.489483373s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.056-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.056-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.057-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34813"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.058-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] build info | build=0 commit="unknown" tid="140077366861824" timestamp=1733879351
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140077366861824" timestamp=1733879351 total_threads=16
+Dec 10 17:09:11 launchpad ollama[421932]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34813" tid="140077366861824" timestamp=1733879351
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:09:11 launchpad ollama[1650]: time=2024-12-10T17:09:11.309-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:09:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:09:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:09:11 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:09:11 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:09:12 launchpad ollama[421932]: INFO [main] model loaded | tid="140077366861824" timestamp=1733879352
+Dec 10 17:09:12 launchpad ollama[1650]: time=2024-12-10T17:09:12.062-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 17:09:20 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:09:20 | 200 |  9.888046721s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.764-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612230656 required="6.2 GiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.765-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.765-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.766-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34009"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:19:08 launchpad ollama[1650]: time=2024-12-10T17:19:08.767-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] build info | build=0 commit="unknown" tid="140584350109696" timestamp=1733879948
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140584350109696" timestamp=1733879948 total_threads=16
+Dec 10 17:19:08 launchpad ollama[421988]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34009" tid="140584350109696" timestamp=1733879948
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:19:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:19:09 launchpad ollama[1650]: time=2024-12-10T17:19:09.018-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:19:09 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:19:09 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:19:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:19:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:19:09 launchpad ollama[421988]: INFO [main] model loaded | tid="140584350109696" timestamp=1733879949
+Dec 10 17:19:10 launchpad ollama[1650]: time=2024-12-10T17:19:10.021-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 17:19:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:19:18 | 200 | 10.018811403s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35857"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] build info | build=0 commit="unknown" tid="140647704125440" timestamp=1733880650
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140647704125440" timestamp=1733880650 total_threads=16
+Dec 10 17:30:50 launchpad ollama[422045]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35857" tid="140647704125440" timestamp=1733880650
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:30:50 launchpad ollama[1650]: time=2024-12-10T17:30:50.558-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:30:50 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:30:50 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:30:50 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:30:51 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:30:51 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:30:51 launchpad ollama[422045]: INFO [main] model loaded | tid="140647704125440" timestamp=1733880651
+Dec 10 17:30:51 launchpad ollama[1650]: time=2024-12-10T17:30:51.561-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 17:31:01 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:31:01 | 200 | 11.723588665s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.989-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41169"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.990-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:49:06 launchpad ollama[1650]: time=2024-12-10T17:49:06.991-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] build info | build=0 commit="unknown" tid="140321830719488" timestamp=1733881747
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140321830719488" timestamp=1733881747 total_threads=16
+Dec 10 17:49:07 launchpad ollama[422099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41169" tid="140321830719488" timestamp=1733881747
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:49:07 launchpad ollama[1650]: time=2024-12-10T17:49:07.241-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:49:07 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:49:07 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:49:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:49:08 launchpad ollama[422099]: INFO [main] model loaded | tid="140321830719488" timestamp=1733881748
+Dec 10 17:49:08 launchpad ollama[1650]: time=2024-12-10T17:49:08.246-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Dec 10 17:49:18 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:49:18 | 200 | 11.847662115s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.829-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.829-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.830-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34709"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 17:56:01 launchpad ollama[1650]: time=2024-12-10T17:56:01.831-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] build info | build=0 commit="unknown" tid="140308399419392" timestamp=1733882161
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140308399419392" timestamp=1733882161 total_threads=16
+Dec 10 17:56:01 launchpad ollama[422136]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34709" tid="140308399419392" timestamp=1733882161
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 17:56:01 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 17:56:02 launchpad ollama[1650]: time=2024-12-10T17:56:02.082-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 17:56:02 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 17:56:02 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 17:56:02 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 17:56:02 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 17:56:02 launchpad ollama[422136]: INFO [main] model loaded | tid="140308399419392" timestamp=1733882162
+Dec 10 17:56:02 launchpad ollama[1650]: time=2024-12-10T17:56:02.836-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 17:56:09 launchpad ollama[1650]: [GIN] 2024/12/10 - 17:56:09 | 200 |  7.935606954s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9612296192 required="6.2 GiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.125-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.126-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44159"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.127-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] build info | build=0 commit="unknown" tid="139812718727168" timestamp=1733882496
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139812718727168" timestamp=1733882496 total_threads=16
+Dec 10 18:01:36 launchpad ollama[422185]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44159" tid="139812718727168" timestamp=1733882496
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 18:01:36 launchpad ollama[1650]: time=2024-12-10T18:01:36.378-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 18:01:36 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 18:01:36 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 18:01:36 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 18:01:37 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 18:01:37 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 18:01:37 launchpad ollama[422185]: INFO [main] model loaded | tid="139812718727168" timestamp=1733882497
+Dec 10 18:01:37 launchpad ollama[1650]: time=2024-12-10T18:01:37.381-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 10 18:01:44 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:01:44 | 200 |  8.499003961s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9499705344 required="6.2 GiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.534-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44585"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.535-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] build info | build=0 commit="unknown" tid="140556930772992" timestamp=1733883073
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140556930772992" timestamp=1733883073 total_threads=16
+Dec 10 18:11:13 launchpad ollama[422251]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44585" tid="140556930772992" timestamp=1733883073
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 10 18:11:13 launchpad ollama[1650]: time=2024-12-10T18:11:13.786-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 10 18:11:13 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 10 18:11:13 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 10 18:11:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 10 18:11:14 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 10 18:11:14 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 10 18:11:14 launchpad ollama[422251]: INFO [main] model loaded | tid="140556930772992" timestamp=1733883074
+Dec 10 18:11:14 launchpad ollama[1650]: time=2024-12-10T18:11:14.540-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 10 18:11:22 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:11:22 | 200 |  9.319415241s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:13:14 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:13:14 | 200 |  3.260898825s |       127.0.0.1 | POST     "/api/chat"
+Dec 10 18:15:42 launchpad ollama[1650]: [GIN] 2024/12/10 - 18:15:42 | 200 |  1.883067966s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.441-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9483714560 required="6.2 GiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.441-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.1 GiB" free_swap="68.9 GiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.442-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37249"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.443-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] build info | build=0 commit="unknown" tid="140625205911552" timestamp=1733931806
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140625205911552" timestamp=1733931806 total_threads=16
+Dec 11 07:43:26 launchpad ollama[423071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37249" tid="140625205911552" timestamp=1733931806
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 11 07:43:26 launchpad ollama[1650]: time=2024-12-11T07:43:26.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 07:43:26 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 07:43:26 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 11 07:43:26 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 07:43:27 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 11 07:43:27 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 11 07:43:27 launchpad ollama[423071]: INFO [main] model loaded | tid="140625205911552" timestamp=1733931807
+Dec 11 07:43:27 launchpad ollama[1650]: time=2024-12-11T07:43:27.447-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 11 07:43:34 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:43:34 | 200 |   8.43094446s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:45:23 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:45:23 | 200 |  7.365034468s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:46:07 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:46:07 | 200 |  4.253881663s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 07:48:08 launchpad ollama[1650]: [GIN] 2024/12/11 - 07:48:08 | 200 |  2.783468333s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:43:30 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:30 | 200 |       15.84µs |       127.0.0.1 | HEAD     "/"
+Dec 11 13:43:30 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:30 | 200 |    3.396496ms |       127.0.0.1 | POST     "/api/show"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.645-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.787-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.788-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35235"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.789-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 13:43:30 launchpad ollama[1650]: time=2024-12-11T13:43:30.790-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] build info | build=0 commit="unknown" tid="140148342874112" timestamp=1733953410
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140148342874112" timestamp=1733953410 total_threads=16
+Dec 11 13:43:30 launchpad ollama[431032]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35235" tid="140148342874112" timestamp=1733953410
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 13:43:30 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 13:43:30 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 13:43:30 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: time=2024-12-11T13:43:31.082-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 13:43:31 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 13:43:31 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 13:43:31 launchpad ollama[431032]: INFO [main] model loaded | tid="140148342874112" timestamp=1733953411
+Dec 11 13:43:32 launchpad ollama[1650]: time=2024-12-11T13:43:32.086-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 13:43:32 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:43:32 | 200 |  1.444791055s |       127.0.0.1 | POST     "/api/generate"
+Dec 11 13:46:44 launchpad ollama[1650]: time=2024-12-11T13:46:44.674-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:46:51 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:46:51 | 200 |  6.888345196s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.248-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.404-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.404-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42881"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.405-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] build info | build=0 commit="unknown" tid="139892779831296" timestamp=1733954039
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139892779831296" timestamp=1733954039 total_threads=16
+Dec 11 13:53:59 launchpad ollama[433241]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42881" tid="139892779831296" timestamp=1733954039
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 13:53:59 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 13:53:59 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 13:53:59 launchpad ollama[1650]: time=2024-12-11T13:53:59.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 13:53:59 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 13:54:00 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 13:54:00 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 13:54:00 launchpad ollama[433241]: INFO [main] model loaded | tid="139892779831296" timestamp=1733954040
+Dec 11 13:54:00 launchpad ollama[1650]: time=2024-12-11T13:54:00.698-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 11 13:54:05 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:54:05 | 200 |  6.126193598s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:55:26 launchpad ollama[1650]: time=2024-12-11T13:55:26.775-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:55:54 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:55:54 | 200 | 27.614988181s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 13:57:52 launchpad ollama[1650]: time=2024-12-11T13:57:52.401-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 13:58:12 launchpad ollama[1650]: [GIN] 2024/12/11 - 13:58:12 | 200 | 20.397791898s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:00:28 launchpad ollama[1650]: time=2024-12-11T14:00:28.893-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:00:34 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:00:34 | 200 |  5.768630267s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:01:48 launchpad ollama[1650]: time=2024-12-11T14:01:48.180-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:01:52 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:01:52 | 200 |  4.009592784s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.620-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.785-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="68.9 GiB"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.786-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36761"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 14:10:11 launchpad ollama[1650]: time=2024-12-11T14:10:11.787-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] build info | build=0 commit="unknown" tid="140105837682688" timestamp=1733955011
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140105837682688" timestamp=1733955011 total_threads=16
+Dec 11 14:10:11 launchpad ollama[451071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36761" tid="140105837682688" timestamp=1733955011
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 14:10:11 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 14:10:11 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 14:10:11 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: time=2024-12-11T14:10:12.090-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 14:10:12 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 14:10:12 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 14:10:12 launchpad ollama[451071]: INFO [main] model loaded | tid="140105837682688" timestamp=1733955012
+Dec 11 14:10:13 launchpad ollama[1650]: time=2024-12-11T14:10:13.093-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 11 14:10:27 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:10:27 | 200 | 15.651452873s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:11:53 launchpad ollama[1650]: time=2024-12-11T14:11:53.390-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:12:01 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:12:01 | 200 |  8.580213294s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.486-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.637-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 45319"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.638-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] build info | build=0 commit="unknown" tid="140361982791680" timestamp=1733956147
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140361982791680" timestamp=1733956147 total_threads=16
+Dec 11 14:29:07 launchpad ollama[455431]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45319" tid="140361982791680" timestamp=1733956147
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 14:29:07 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 14:29:07 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 14:29:07 launchpad ollama[1650]: time=2024-12-11T14:29:07.938-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 14:29:07 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 14:29:08 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 14:29:08 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 14:29:08 launchpad ollama[455431]: INFO [main] model loaded | tid="140361982791680" timestamp=1733956148
+Dec 11 14:29:08 launchpad ollama[1650]: time=2024-12-11T14:29:08.942-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 14:30:20 launchpad ollama[1650]: [GIN] 2024/12/11 - 14:30:20 | 200 |         1m13s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.445-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.603-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.604-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43521"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.605-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] build info | build=0 commit="unknown" tid="140609861214208" timestamp=1733960509
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140609861214208" timestamp=1733960509 total_threads=16
+Dec 11 15:41:49 launchpad ollama[474523]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43521" tid="140609861214208" timestamp=1733960509
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 11 15:41:49 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 11 15:41:49 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 11 15:41:49 launchpad ollama[1650]: time=2024-12-11T15:41:49.903-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 11 15:41:49 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 11 15:41:50 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 11 15:41:50 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 11 15:41:50 launchpad ollama[474523]: INFO [main] model loaded | tid="140609861214208" timestamp=1733960510
+Dec 11 15:41:50 launchpad ollama[1650]: time=2024-12-11T15:41:50.907-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 11 15:42:04 launchpad ollama[1650]: [GIN] 2024/12/11 - 15:42:04 | 200 | 14.977727541s |       127.0.0.1 | POST     "/api/chat"
+Dec 11 15:42:32 launchpad ollama[1650]: time=2024-12-11T15:42:32.466-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 11 15:42:57 launchpad ollama[1650]: [GIN] 2024/12/11 - 15:42:57 | 200 | 24.726327701s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.782-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.923-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.923-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46625"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 10:38:15 launchpad ollama[1650]: time=2024-12-12T10:38:15.924-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] build info | build=0 commit="unknown" tid="140242544910336" timestamp=1734028695
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242544910336" timestamp=1734028695 total_threads=16
+Dec 12 10:38:15 launchpad ollama[525723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46625" tid="140242544910336" timestamp=1734028695
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 10:38:15 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 10:38:15 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 10:38:15 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: time=2024-12-12T10:38:16.227-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: offloading 39 repeating layers to GPU
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors: offloaded 39/41 layers to GPU
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 10:38:16 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 10:38:16 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 15
+Dec 12 10:38:17 launchpad ollama[525723]: INFO [main] model loaded | tid="140242544910336" timestamp=1734028697
+Dec 12 10:38:17 launchpad ollama[1650]: time=2024-12-12T10:38:17.230-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 10:38:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 10:38:33 | 200 | 18.154729699s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 10:39:27 launchpad ollama[1650]: time=2024-12-12T10:39:27.181-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 10:39:38 launchpad ollama[1650]: [GIN] 2024/12/12 - 10:39:38 | 200 | 11.712285197s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.677-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.822-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.3 GiB" free_swap="68.9 GiB"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.823-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 41259"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:07:12 launchpad ollama[1650]: time=2024-12-12T14:07:12.824-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] build info | build=0 commit="unknown" tid="140266838138880" timestamp=1734041232
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140266838138880" timestamp=1734041232 total_threads=16
+Dec 12 14:07:12 launchpad ollama[537723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41259" tid="140266838138880" timestamp=1734041232
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:07:12 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:07:12 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:07:12 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: time=2024-12-12T14:07:13.128-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:07:13 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:07:13 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:07:13 launchpad ollama[537723]: INFO [main] model loaded | tid="140266838138880" timestamp=1734041233
+Dec 12 14:07:14 launchpad ollama[1650]: time=2024-12-12T14:07:14.131-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 14:07:31 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:07:31 | 200 | 18.419763856s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:09:22 launchpad ollama[1650]: time=2024-12-12T14:09:22.170-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:09:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:09:33 | 200 | 11.519442637s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:23:20 launchpad ollama[1650]: time=2024-12-12T14:23:20.894-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.040-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.040-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34581"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.041-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.042-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] build info | build=0 commit="unknown" tid="139623722336256" timestamp=1734042201
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139623722336256" timestamp=1734042201 total_threads=16
+Dec 12 14:23:21 launchpad ollama[544424]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34581" tid="139623722336256" timestamp=1734042201
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:23:21 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:23:21 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: time=2024-12-12T14:23:21.339-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:23:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:23:22 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:23:22 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:23:22 launchpad ollama[544424]: INFO [main] model loaded | tid="139623722336256" timestamp=1734042202
+Dec 12 14:23:22 launchpad ollama[1650]: time=2024-12-12T14:23:22.343-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:23:32 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:23:32 | 200 | 11.464269514s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.797-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.945-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.946-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34773"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:30:38 launchpad ollama[1650]: time=2024-12-12T14:30:38.947-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] build info | build=0 commit="unknown" tid="140561075200000" timestamp=1734042638
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140561075200000" timestamp=1734042638 total_threads=16
+Dec 12 14:30:38 launchpad ollama[546598]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34773" tid="140561075200000" timestamp=1734042638
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:30:38 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:30:38 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:30:38 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: time=2024-12-12T14:30:39.239-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:30:39 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:30:39 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:30:40 launchpad ollama[546598]: INFO [main] model loaded | tid="140561075200000" timestamp=1734042640
+Dec 12 14:30:40 launchpad ollama[1650]: time=2024-12-12T14:30:40.243-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:31:02 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:31:02 | 200 | 24.058789556s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:32:38 launchpad ollama[1650]: time=2024-12-12T14:32:38.995-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:33:03 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:33:03 | 200 | 24.233151055s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.089-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.247-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33447"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] build info | build=0 commit="unknown" tid="139713601077248" timestamp=1734043097
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139713601077248" timestamp=1734043097 total_threads=16
+Dec 12 14:38:17 launchpad ollama[557894]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33447" tid="139713601077248" timestamp=1734043097
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:38:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:38:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:38:17 launchpad ollama[1650]: time=2024-12-12T14:38:17.536-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:38:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:38:18 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:38:18 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:38:18 launchpad ollama[557894]: INFO [main] model loaded | tid="139713601077248" timestamp=1734043098
+Dec 12 14:38:18 launchpad ollama[1650]: time=2024-12-12T14:38:18.541-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 12 14:38:33 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:38:33 | 200 | 16.183629953s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:42:33 launchpad ollama[1650]: time=2024-12-12T14:42:33.257-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:42:49 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:42:49 | 200 | 16.693109504s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:43:23 launchpad ollama[1650]: time=2024-12-12T14:43:23.051-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:43:40 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:43:40 | 200 | 17.400904173s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:46:14 launchpad ollama[1650]: time=2024-12-12T14:46:14.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:46:26 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:46:26 | 200 |   12.2102709s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:48:06 launchpad ollama[1650]: time=2024-12-12T14:48:06.806-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:48:25 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:48:25 | 200 | 18.215535691s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.506-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.659-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.659-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44653"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.660-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.661-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] build info | build=0 commit="unknown" tid="140114165587968" timestamp=1734044212
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140114165587968" timestamp=1734044212 total_threads=16
+Dec 12 14:56:52 launchpad ollama[575346]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44653" tid="140114165587968" timestamp=1734044212
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 14:56:52 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 14:56:52 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 14:56:52 launchpad ollama[1650]: time=2024-12-12T14:56:52.954-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 14:56:52 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 14:56:53 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 14:56:53 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 14:56:53 launchpad ollama[575346]: INFO [main] model loaded | tid="140114165587968" timestamp=1734044213
+Dec 12 14:56:53 launchpad ollama[1650]: time=2024-12-12T14:56:53.958-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 14:57:40 launchpad ollama[1650]: [GIN] 2024/12/12 - 14:57:40 | 200 | 47.735583444s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 15:00:42 launchpad ollama[1650]: time=2024-12-12T15:00:42.823-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 15:01:25 launchpad ollama[1650]: [GIN] 2024/12/12 - 15:01:25 | 200 | 42.737194334s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.290-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.440-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.440-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.441-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35177"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.442-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] build info | build=0 commit="unknown" tid="140033460981760" timestamp=1734047660
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140033460981760" timestamp=1734047660 total_threads=16
+Dec 12 15:54:20 launchpad ollama[599855]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35177" tid="140033460981760" timestamp=1734047660
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 15:54:20 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 15:54:20 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 15:54:20 launchpad ollama[1650]: time=2024-12-12T15:54:20.732-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 15:54:20 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 15:54:21 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 15:54:21 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 15:54:21 launchpad ollama[599855]: INFO [main] model loaded | tid="140033460981760" timestamp=1734047661
+Dec 12 15:54:21 launchpad ollama[1650]: time=2024-12-12T15:54:21.736-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 12 15:54:46 launchpad ollama[1650]: [GIN] 2024/12/12 - 15:54:46 | 200 | 26.277009717s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.303-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.453-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.453-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 34969"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.454-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] build info | build=0 commit="unknown" tid="140453608828928" timestamp=1734048728
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140453608828928" timestamp=1734048728 total_threads=16
+Dec 12 16:12:08 launchpad ollama[607541]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34969" tid="140453608828928" timestamp=1734048728
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 16:12:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 16:12:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 16:12:08 launchpad ollama[1650]: time=2024-12-12T16:12:08.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 16:12:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 16:12:09 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 16:12:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 16:12:09 launchpad ollama[607541]: INFO [main] model loaded | tid="140453608828928" timestamp=1734048729
+Dec 12 16:12:09 launchpad ollama[1650]: time=2024-12-12T16:12:09.756-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 12 16:12:22 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:12:22 | 200 | 14.324179286s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:13:13 launchpad ollama[1650]: time=2024-12-12T16:13:13.285-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:13:22 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:13:22 | 200 |  9.382966715s |       127.0.0.1 | POST     "/api/chat"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.810-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.975-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.7 GiB" free_swap="68.9 GiB"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.975-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 43449"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 12 16:37:28 launchpad ollama[1650]: time=2024-12-12T16:37:28.977-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] build info | build=0 commit="unknown" tid="140152331472896" timestamp=1734050249
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140152331472896" timestamp=1734050249 total_threads=16
+Dec 12 16:37:29 launchpad ollama[615372]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43449" tid="140152331472896" timestamp=1734050249
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type  f32:   81 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type q4_0:  281 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 3
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V2
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: vocab type       = SPM
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 32016
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_head           = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 40
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 13824
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model type       = 13B
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model params     = 13.02 B
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: general.name     = codellama
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 1 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 2 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: UNK token        = 0 ''
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_print_meta: max token length = 48
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 12 16:37:29 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 12 16:37:29 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: time=2024-12-12T16:37:29.283-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 2048
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 12 16:37:29 launchpad ollama[1650]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1286
+Dec 12 16:37:29 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 26
+Dec 12 16:37:30 launchpad ollama[615372]: INFO [main] model loaded | tid="140152331472896" timestamp=1734050250
+Dec 12 16:37:30 launchpad ollama[1650]: time=2024-12-12T16:37:30.286-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 12 16:37:47 launchpad ollama[1650]: [GIN] 2024/12/12 - 16:37:47 | 200 | 18.254656316s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 15:56:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:33 | 200 |      16.283µs |       127.0.0.1 | HEAD     "/"
+Dec 13 15:56:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:33 | 200 |   12.805255ms |       127.0.0.1 | POST     "/api/show"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9348120576 required="6.2 GiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.314-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.315-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44243"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.316-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] build info | build=0 commit="unknown" tid="139786189860864" timestamp=1734134193
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139786189860864" timestamp=1734134193 total_threads=16
+Dec 13 15:56:33 launchpad ollama[636600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44243" tid="139786189860864" timestamp=1734134193
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 15:56:33 launchpad ollama[1650]: time=2024-12-13T15:56:33.566-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 15:56:33 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 15:56:33 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 15:56:33 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 15:56:34 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 15:56:34 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 15:56:34 launchpad ollama[636600]: INFO [main] model loaded | tid="139786189860864" timestamp=1734134194
+Dec 13 15:56:34 launchpad ollama[1650]: time=2024-12-13T15:56:34.320-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 15:56:34 launchpad ollama[1650]: [GIN] 2024/12/13 - 15:56:34 | 200 |  1.171286506s |       127.0.0.1 | POST     "/api/generate"
+Dec 13 16:01:29 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:01:29 | 200 |  6.136587742s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:02:33 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:02:33 | 200 |  4.447422642s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:04:01 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:04:01 | 200 |  5.357672267s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.267-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9507045376 required="6.2 GiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.268-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.269-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42889"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.270-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] build info | build=0 commit="unknown" tid="140604394651648" timestamp=1734135075
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604394651648" timestamp=1734135075 total_threads=16
+Dec 13 16:11:15 launchpad ollama[636840]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42889" tid="140604394651648" timestamp=1734135075
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:11:15 launchpad ollama[1650]: time=2024-12-13T16:11:15.521-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:11:15 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:11:15 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:11:15 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:11:16 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:11:16 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:11:16 launchpad ollama[636840]: INFO [main] model loaded | tid="140604394651648" timestamp=1734135076
+Dec 13 16:11:16 launchpad ollama[1650]: time=2024-12-13T16:11:16.274-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:11:23 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:11:23 | 200 |  8.898060255s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:15:11 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:15:11 | 200 |  6.804123041s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9519038464 required="6.2 GiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.205-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39921"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.206-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.207-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] build info | build=0 commit="unknown" tid="139850658385920" timestamp=1734136435
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139850658385920" timestamp=1734136435 total_threads=16
+Dec 13 16:33:55 launchpad ollama[637115]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39921" tid="139850658385920" timestamp=1734136435
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:33:55 launchpad ollama[1650]: time=2024-12-13T16:33:55.458-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:33:55 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:33:55 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:33:55 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:33:56 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:33:56 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:33:56 launchpad ollama[637115]: INFO [main] model loaded | tid="139850658385920" timestamp=1734136436
+Dec 13 16:33:56 launchpad ollama[1650]: time=2024-12-13T16:33:56.210-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:34:02 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:34:02 | 200 |  7.432932971s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.720-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9371385856 required="6.2 GiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.720-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.721-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36199"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.722-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] build info | build=0 commit="unknown" tid="140314999214080" timestamp=1734136936
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140314999214080" timestamp=1734136936 total_threads=16
+Dec 13 16:42:16 launchpad ollama[637303]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36199" tid="140314999214080" timestamp=1734136936
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 13 16:42:16 launchpad ollama[1650]: time=2024-12-13T16:42:16.973-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 13 16:42:16 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 13 16:42:17 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 13 16:42:17 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 13 16:42:17 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 13 16:42:17 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 13 16:42:17 launchpad ollama[637303]: INFO [main] model loaded | tid="140314999214080" timestamp=1734136937
+Dec 13 16:42:17 launchpad ollama[1650]: time=2024-12-13T16:42:17.726-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 13 16:42:22 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:42:22 | 200 |  5.898577155s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:43:36 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:43:36 | 200 |  2.113112404s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:47:06 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:47:06 | 200 |  3.491798481s |       127.0.0.1 | POST     "/api/chat"
+Dec 13 16:49:11 launchpad ollama[1650]: [GIN] 2024/12/13 - 16:49:11 | 200 |  878.893601ms |       127.0.0.1 | POST     "/api/chat"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.541-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989310976 required="6.2 GiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.541-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.542-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38123"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.543-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] build info | build=0 commit="unknown" tid="140598098186240" timestamp=1734194529
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140598098186240" timestamp=1734194529 total_threads=16
+Dec 14 08:42:09 launchpad ollama[642606]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38123" tid="140598098186240" timestamp=1734194529
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 08:42:09 launchpad ollama[1650]: time=2024-12-14T08:42:09.794-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 08:42:09 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 08:42:09 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 08:42:09 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 08:42:10 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 08:42:10 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 08:42:10 launchpad ollama[642606]: INFO [main] model loaded | tid="140598098186240" timestamp=1734194530
+Dec 14 08:42:10 launchpad ollama[1650]: time=2024-12-14T08:42:10.547-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 08:42:26 launchpad ollama[1650]: [GIN] 2024/12/14 - 08:42:26 | 200 | 16.798782892s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.229-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989638656 required="6.2 GiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.229-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.230-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39261"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.231-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] build info | build=0 commit="unknown" tid="140004482441216" timestamp=1734194946
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140004482441216" timestamp=1734194946 total_threads=16
+Dec 14 08:49:06 launchpad ollama[642642]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39261" tid="140004482441216" timestamp=1734194946
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 08:49:06 launchpad ollama[1650]: time=2024-12-14T08:49:06.482-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 08:49:06 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 08:49:06 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 08:49:06 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 08:49:07 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 08:49:07 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 08:49:07 launchpad ollama[642642]: INFO [main] model loaded | tid="140004482441216" timestamp=1734194947
+Dec 14 08:49:07 launchpad ollama[1650]: time=2024-12-14T08:49:07.234-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 08:49:17 launchpad ollama[1650]: [GIN] 2024/12/14 - 08:49:17 | 200 | 11.443394435s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989900800 required="6.2 GiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.155-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40259"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.156-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.157-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] build info | build=0 commit="unknown" tid="140141380591616" timestamp=1734195653
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140141380591616" timestamp=1734195653 total_threads=16
+Dec 14 09:00:53 launchpad ollama[642700]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40259" tid="140141380591616" timestamp=1734195653
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:00:53 launchpad ollama[1650]: time=2024-12-14T09:00:53.408-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:00:53 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:00:53 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:00:53 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:00:54 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:00:54 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:00:54 launchpad ollama[642700]: INFO [main] model loaded | tid="140141380591616" timestamp=1734195654
+Dec 14 09:00:54 launchpad ollama[1650]: time=2024-12-14T09:00:54.161-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:01:04 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:01:04 | 200 | 11.281515021s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:05:25 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:05:25 | 200 | 10.312164917s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989638656 required="6.2 GiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.228-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46307"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.229-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] build info | build=0 commit="unknown" tid="139857237069824" timestamp=1734196448
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139857237069824" timestamp=1734196448 total_threads=16
+Dec 14 09:14:08 launchpad ollama[642758]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46307" tid="139857237069824" timestamp=1734196448
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:14:08 launchpad ollama[1650]: time=2024-12-14T09:14:08.481-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:14:08 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:14:08 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:14:08 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:14:09 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:14:09 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:14:09 launchpad ollama[642758]: INFO [main] model loaded | tid="139857237069824" timestamp=1734196449
+Dec 14 09:14:09 launchpad ollama[1650]: time=2024-12-14T09:14:09.234-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:14:18 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:14:18 | 200 | 10.829763262s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.571-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8841265152 required="6.2 GiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.571-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.572-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1292331900/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36277"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.573-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] build info | build=0 commit="unknown" tid="139682762309632" timestamp=1734196892
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139682762309632" timestamp=1734196892 total_threads=16
+Dec 14 09:21:32 launchpad ollama[643754]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36277" tid="139682762309632" timestamp=1734196892
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type  f32:   65 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type q4_0:  225 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llama_model_loader: - type q6_K:    1 tensors
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_vocab: special tokens cache size = 256
+Dec 14 09:21:32 launchpad ollama[1650]: time=2024-12-14T09:21:32.824-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: arch             = llama
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: vocab type       = BPE
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_vocab          = 128256
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_merges         = 280147
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: vocab_only       = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd           = 4096
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_layer          = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_head           = 32
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_head_kv        = 8
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_rot            = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_swa            = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_gqa            = 4
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ff             = 14336
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_expert         = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_expert_used    = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: causal attn      = 1
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: pooling type     = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope type        = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope scaling     = linear
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: freq_scale_train = 1
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_d_state      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model type       = 8B
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model ftype      = Q4_0
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model params     = 8.03 B
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_print_meta: max token length = 256
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 14 09:21:32 launchpad ollama[1650]: ggml_cuda_init: found 1 CUDA devices:
+Dec 14 09:21:32 launchpad ollama[1650]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 14 09:21:32 launchpad ollama[1650]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_ctx      = 8192
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_batch    = 512
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: n_ubatch   = 512
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: flash_attn = 0
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: freq_scale = 1
+Dec 14 09:21:33 launchpad ollama[1650]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: graph nodes  = 1030
+Dec 14 09:21:33 launchpad ollama[1650]: llama_new_context_with_model: graph splits = 2
+Dec 14 09:21:33 launchpad ollama[643754]: INFO [main] model loaded | tid="139682762309632" timestamp=1734196893
+Dec 14 09:21:33 launchpad ollama[1650]: time=2024-12-14T09:21:33.576-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 14 09:21:40 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:21:40 | 200 |  7.608779732s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:22:59 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:22:59 | 200 |  6.126859969s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:25:00 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:25:00 | 200 |  4.930539019s |       127.0.0.1 | POST     "/api/chat"
+Dec 14 09:28:02 launchpad ollama[1650]: [GIN] 2024/12/14 - 09:28:02 | 200 |  4.348262508s |       127.0.0.1 | POST     "/api/chat"
+Dec 15 08:15:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:15:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:15:59 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:15:59 launchpad systemd[1]: ollama.service: Consumed 1h 49min 39.416s CPU time, 12.6G memory peak, 11.4G read from disk, 508.3M written to disk.
+-- Boot 7f8995ca71744c71aa779feae177374a --
+Dec 15 08:16:34 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:16:34 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:16:34 launchpad ollama[1569]: 2024/12/15 08:16:34 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.509-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.513-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.514-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:16:34 launchpad ollama[1569]: time=2024-12-15T08:16:34.515-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2435835559/runners
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.509-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:16:37 launchpad ollama[1569]: time=2024-12-15T08:16:37.721-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 15 08:22:59 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:23:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:23:00 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:23:00 launchpad systemd[1]: ollama.service: Consumed 3.444s CPU time, 790.5M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot 587ab3b258734cc5b4c9607b461135f6 --
+Dec 15 08:23:32 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:23:32 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:23:32 launchpad ollama[1568]: 2024/12/15 08:23:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.476-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.479-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.480-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:23:32 launchpad ollama[1568]: time=2024-12-15T08:23:32.482-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3123740476/runners
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.414-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:23:35 launchpad ollama[1568]: time=2024-12-15T08:23:35.651-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 15 08:27:09 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 15 08:27:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 15 08:27:09 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 15 08:27:09 launchpad systemd[1]: ollama.service: Consumed 3.431s CPU time, 790.8M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 7f13a190f66242fd900f6239a6cfd33c --
+Dec 15 08:27:42 launchpad systemd[1]: Starting Server for local large language models...
+Dec 15 08:27:42 launchpad systemd[1]: Started Server for local large language models.
+Dec 15 08:27:42 launchpad ollama[1572]: 2024/12/15 08:27:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.977-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.981-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.982-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 15 08:27:42 launchpad ollama[1572]: time=2024-12-15T08:27:42.984-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4062981542/runners
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:45 launchpad ollama[1572]: time=2024-12-15T08:27:45.922-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 15 08:27:46 launchpad ollama[1572]: time=2024-12-15T08:27:46.152-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 16 08:07:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:07 | 200 |    1.732896ms |       127.0.0.1 | HEAD     "/"
+Dec 16 08:07:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:07 | 200 |   25.578353ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.213-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.420-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="66.6 GiB"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.421-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.428-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 40921"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.429-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.429-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.430-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] build info | build=0 commit="unknown" tid="139941389623296" timestamp=1734365227
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139941389623296" timestamp=1734365227 total_threads=16
+Dec 16 08:07:07 launchpad ollama[132786]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40921" tid="139941389623296" timestamp=1734365227
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:07:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:07:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:07:07 launchpad ollama[1572]: time=2024-12-16T08:07:07.681-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:07:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:07:15 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:07:15 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:07:16 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:07:16 launchpad ollama[132786]: INFO [main] model loaded | tid="139941389623296" timestamp=1734365236
+Dec 16 08:07:16 launchpad ollama[1572]: time=2024-12-16T08:07:16.211-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 16 08:07:16 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:07:16 | 200 |  9.003896893s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.839-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.981-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="66.6 GiB"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.981-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 37961"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:16:06 launchpad ollama[1572]: time=2024-12-16T08:16:06.983-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] build info | build=0 commit="unknown" tid="140040237510656" timestamp=1734365767
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140040237510656" timestamp=1734365767 total_threads=16
+Dec 16 08:16:07 launchpad ollama[133209]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37961" tid="140040237510656" timestamp=1734365767
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:16:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:16:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: time=2024-12-16T08:16:07.305-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:16:07 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:16:07 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:16:08 launchpad ollama[133209]: INFO [main] model loaded | tid="140040237510656" timestamp=1734365768
+Dec 16 08:16:08 launchpad ollama[1572]: time=2024-12-16T08:16:08.308-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.33 seconds"
+Dec 16 08:16:38 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:16:38 | 200 |  31.35051265s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.097-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.243-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="66.6 GiB"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.244-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 41903"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.245-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.246-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] build info | build=0 commit="unknown" tid="140577512665088" timestamp=1734366104
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140577512665088" timestamp=1734366104 total_threads=16
+Dec 16 08:21:44 launchpad ollama[140970]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41903" tid="140577512665088" timestamp=1734366104
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 08:21:44 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 08:21:44 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 08:21:44 launchpad ollama[1572]: time=2024-12-16T08:21:44.561-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 08:21:44 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 08:21:45 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 08:21:45 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 08:21:45 launchpad ollama[140970]: INFO [main] model loaded | tid="140577512665088" timestamp=1734366105
+Dec 16 08:21:45 launchpad ollama[1572]: time=2024-12-16T08:21:45.567-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 08:22:16 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:22:16 | 200 | 31.955048728s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:26:25 launchpad ollama[1572]: time=2024-12-16T08:26:25.556-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:27:12 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:27:12 | 200 | 46.798941573s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 08:28:49 launchpad ollama[1572]: time=2024-12-16T08:28:49.020-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 08:28:55 launchpad ollama[1572]: [GIN] 2024/12/16 - 08:28:55 | 200 |  6.599727412s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:32:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:21 | 200 |      15.818µs |       127.0.0.1 | HEAD     "/"
+Dec 16 13:32:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:21 | 200 |    3.293818ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.035-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.178-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.6 GiB" free_swap="66.9 GiB"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.178-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 42837"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.180-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.181-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] build info | build=0 commit="unknown" tid="140314879602688" timestamp=1734384741
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140314879602688" timestamp=1734384741 total_threads=16
+Dec 16 13:32:21 launchpad ollama[233429]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42837" tid="140314879602688" timestamp=1734384741
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 13:32:21 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 13:32:21 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 13:32:21 launchpad ollama[1572]: time=2024-12-16T13:32:21.494-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 13:32:21 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 13:32:22 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 13:32:22 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 13:32:22 launchpad ollama[233429]: INFO [main] model loaded | tid="140314879602688" timestamp=1734384742
+Dec 16 13:32:22 launchpad ollama[1572]: time=2024-12-16T13:32:22.497-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 13:32:22 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:32:22 | 200 |  1.465909497s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 13:33:03 launchpad ollama[1572]: time=2024-12-16T13:33:03.838-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:33:22 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:33:22 | 200 | 18.586175577s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:35:13 launchpad ollama[1572]: time=2024-12-16T13:35:13.351-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:35:29 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:35:29 | 200 | 16.096727432s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:38:04 launchpad ollama[1572]: time=2024-12-16T13:38:04.986-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:38:21 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:38:21 | 200 | 16.795888904s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 13:39:04 launchpad ollama[1572]: time=2024-12-16T13:39:04.636-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 13:39:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 13:39:07 | 200 |  2.562633713s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:46:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:07 | 200 |      16.015µs |       127.0.0.1 | HEAD     "/"
+Dec 16 16:46:07 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:07 | 200 |    3.401384ms |       127.0.0.1 | POST     "/api/show"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.605-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.740-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="66.9 GiB"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.740-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 34999"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 16:46:07 launchpad ollama[1572]: time=2024-12-16T16:46:07.742-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] build info | build=0 commit="unknown" tid="140242733588480" timestamp=1734396367
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242733588480" timestamp=1734396367 total_threads=16
+Dec 16 16:46:07 launchpad ollama[255053]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34999" tid="140242733588480" timestamp=1734396367
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 16:46:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 16:46:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 16:46:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: time=2024-12-16T16:46:08.057-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 16:46:08 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 16:46:08 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 16:46:08 launchpad ollama[255053]: INFO [main] model loaded | tid="140242733588480" timestamp=1734396368
+Dec 16 16:46:09 launchpad ollama[1572]: time=2024-12-16T16:46:09.060-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 16:46:09 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:46:09 | 200 |  1.457804183s |       127.0.0.1 | POST     "/api/generate"
+Dec 16 16:47:28 launchpad ollama[1572]: time=2024-12-16T16:47:28.096-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:47:43 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:47:43 | 200 | 15.256738627s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:49:14 launchpad ollama[1572]: time=2024-12-16T16:49:14.434-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:49:30 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:49:30 | 200 | 16.045843727s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.545-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.691-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.7 GiB" free_swap="66.9 GiB"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.692-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.693-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 43583"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 16:58:43 launchpad ollama[1572]: time=2024-12-16T16:58:43.694-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] build info | build=0 commit="unknown" tid="140208758382592" timestamp=1734397123
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140208758382592" timestamp=1734397123 total_threads=16
+Dec 16 16:58:43 launchpad ollama[263521]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43583" tid="140208758382592" timestamp=1734397123
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 16:58:43 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 16:58:43 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 16:58:43 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: time=2024-12-16T16:58:44.004-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 16:58:44 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 16:58:44 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 16 16:58:44 launchpad ollama[263521]: INFO [main] model loaded | tid="140208758382592" timestamp=1734397124
+Dec 16 16:58:45 launchpad ollama[1572]: time=2024-12-16T16:58:45.008-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Dec 16 16:58:59 launchpad ollama[1572]: [GIN] 2024/12/16 - 16:58:59 | 200 | 16.429269815s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.221-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.371-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.371-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 39603"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] build info | build=0 commit="unknown" tid="140615269732352" timestamp=1734398310
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140615269732352" timestamp=1734398310 total_threads=16
+Dec 16 17:18:30 launchpad ollama[267913]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39603" tid="140615269732352" timestamp=1734398310
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:18:30 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:18:30 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:18:30 launchpad ollama[1572]: time=2024-12-16T17:18:30.690-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:18:30 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:18:31 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:18:31 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 16 17:18:31 launchpad ollama[267913]: INFO [main] model loaded | tid="140615269732352" timestamp=1734398311
+Dec 16 17:18:31 launchpad ollama[1572]: time=2024-12-16T17:18:31.694-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:18:46 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:18:46 | 200 | 16.255247256s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:21:03 launchpad ollama[1572]: time=2024-12-16T17:21:03.361-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:21:19 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:21:19 | 200 | 16.126206061s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.131-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.280-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.280-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.281-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 46803"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.282-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] build info | build=0 commit="unknown" tid="139700488409088" timestamp=1734399032
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139700488409088" timestamp=1734399032 total_threads=16
+Dec 16 17:30:32 launchpad ollama[275350]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46803" tid="139700488409088" timestamp=1734399032
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:30:32 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:30:32 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:30:32 launchpad ollama[1572]: time=2024-12-16T17:30:32.598-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:30:32 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:30:33 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:30:33 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 17:30:33 launchpad ollama[275350]: INFO [main] model loaded | tid="139700488409088" timestamp=1734399033
+Dec 16 17:30:33 launchpad ollama[1572]: time=2024-12-16T17:30:33.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:30:55 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:30:55 | 200 |  23.71460947s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.789-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.941-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="66.9 GiB"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.941-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.942-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 35353"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 17:37:58 launchpad ollama[1572]: time=2024-12-16T17:37:58.943-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] build info | build=0 commit="unknown" tid="140131594788864" timestamp=1734399478
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140131594788864" timestamp=1734399478 total_threads=16
+Dec 16 17:37:58 launchpad ollama[279987]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35353" tid="140131594788864" timestamp=1734399478
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 17:37:58 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 17:37:58 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 17:37:58 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: time=2024-12-16T17:37:59.260-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 17:37:59 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 17:37:59 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 16 17:38:00 launchpad ollama[279987]: INFO [main] model loaded | tid="140131594788864" timestamp=1734399480
+Dec 16 17:38:00 launchpad ollama[1572]: time=2024-12-16T17:38:00.266-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 17:38:09 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:38:09 | 200 | 11.084683925s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 17:40:26 launchpad ollama[1572]: time=2024-12-16T17:40:26.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 17:40:45 launchpad ollama[1572]: [GIN] 2024/12/16 - 17:40:45 | 200 | 18.796467048s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 18:13:44 launchpad ollama[1572]: time=2024-12-16T18:13:44.876-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.025-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="66.9 GiB"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.026-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36911"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.027-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] build info | build=0 commit="unknown" tid="139918625984512" timestamp=1734401625
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139918625984512" timestamp=1734401625 total_threads=16
+Dec 16 18:13:45 launchpad ollama[286600]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36911" tid="139918625984512" timestamp=1734401625
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 16 18:13:45 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 16 18:13:45 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: time=2024-12-16T18:13:45.341-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 16 18:13:45 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 16 18:13:46 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 16 18:13:46 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 26
+Dec 16 18:13:46 launchpad ollama[286600]: INFO [main] model loaded | tid="139918625984512" timestamp=1734401626
+Dec 16 18:13:46 launchpad ollama[1572]: time=2024-12-16T18:13:46.345-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 16 18:14:06 launchpad ollama[1572]: [GIN] 2024/12/16 - 18:14:06 | 200 | 21.318673353s |       127.0.0.1 | POST     "/api/chat"
+Dec 16 18:14:27 launchpad ollama[1572]: time=2024-12-16T18:14:27.247-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 16 18:14:38 launchpad ollama[1572]: [GIN] 2024/12/16 - 18:14:38 | 200 | 11.437179683s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.573-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.727-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="67.0 GiB"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.727-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 34391"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.728-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 17 07:49:37 launchpad ollama[1572]: time=2024-12-17T07:49:37.729-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] build info | build=0 commit="unknown" tid="140325011386368" timestamp=1734450577
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140325011386368" timestamp=1734450577 total_threads=16
+Dec 17 07:49:37 launchpad ollama[299296]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34391" tid="140325011386368" timestamp=1734450577
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 17 07:49:37 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 17 07:49:37 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 17 07:49:37 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: time=2024-12-17T07:49:38.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors: offloading 36 repeating layers to GPU
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors: offloaded 36/41 layers to GPU
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 17 07:49:38 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 17 07:49:38 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 48
+Dec 17 07:49:38 launchpad ollama[299296]: INFO [main] model loaded | tid="140325011386368" timestamp=1734450578
+Dec 17 07:49:39 launchpad ollama[1572]: time=2024-12-17T07:49:39.047-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 17 07:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:50:12 | 200 | 34.923115772s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:51:37 launchpad ollama[1572]: time=2024-12-17T07:51:37.603-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:51:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:51:53 | 200 | 16.138929914s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:52:55 launchpad ollama[1572]: time=2024-12-17T07:52:55.236-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:53:31 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:53:31 | 200 | 35.837838754s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:55:48 launchpad ollama[1572]: time=2024-12-17T07:55:48.658-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:56:04 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:56:04 | 200 | 15.734550328s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 07:59:28 launchpad ollama[1572]: time=2024-12-17T07:59:28.613-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 07:59:40 launchpad ollama[1572]: [GIN] 2024/12/17 - 07:59:40 | 200 | 11.434446802s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:20:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:20:34 | 200 |      15.603µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:20:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:20:34 | 200 |    5.159978ms |       127.0.0.1 | GET      "/api/tags"
+Dec 17 11:25:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:25:09 | 404 |     112.388µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:27:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:27:30 | 404 |     201.257µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:28:22 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:28:22 | 404 |     112.055µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:28:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:28:45 | 404 |     143.418µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:29:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:29:30 | 404 |      94.585µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:29:47 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:29:47 | 404 |     148.802µs |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:32:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:32:12 | 404 |     146.222µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:33:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:33:02 | 404 |      80.916µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:34:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:02 | 200 |      16.415µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:34:02 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:02 | 200 |    1.756919ms |       127.0.0.1 | GET      "/api/tags"
+Dec 17 11:34:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:35 | 404 |      84.811µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:34:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:34:49 | 200 |    3.470226ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.186-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.346-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="67.0 GiB"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.346-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4062981542/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 41259"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.348-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] build info | build=0 commit="unknown" tid="140081273618432" timestamp=1734464107
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140081273618432" timestamp=1734464107 total_threads=16
+Dec 17 11:35:07 launchpad ollama[861591]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41259" tid="140081273618432" timestamp=1734464107
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type  f32:   81 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type q4_0:  281 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llama_model_loader: - type q6_K:    1 tensors
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_vocab: special tokens cache size = 3
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: format           = GGUF V2
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: arch             = llama
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: vocab type       = SPM
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_vocab          = 32016
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_merges         = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: vocab_only       = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd           = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_layer          = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_head           = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_head_kv        = 40
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_rot            = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_swa            = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_gqa            = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ff             = 13824
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_expert         = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_expert_used    = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: causal attn      = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: pooling type     = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope type        = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope scaling     = linear
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: freq_scale_train = 1
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_d_state      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model type       = 13B
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model ftype      = Q4_0
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model params     = 13.02 B
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: general.name     = codellama
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: BOS token        = 1 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: EOS token        = 2 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: UNK token        = 0 ''
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_print_meta: max token length = 48
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 17 11:35:07 launchpad ollama[1572]: ggml_cuda_init: found 1 CUDA devices:
+Dec 17 11:35:07 launchpad ollama[1572]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 17 11:35:07 launchpad ollama[1572]: time=2024-12-17T11:35:07.671-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: offloading 37 repeating layers to GPU
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors: offloaded 37/41 layers to GPU
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 17 11:35:07 launchpad ollama[1572]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_ctx      = 2048
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_batch    = 512
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: n_ubatch   = 512
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: flash_attn = 0
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: freq_scale = 1
+Dec 17 11:35:08 launchpad ollama[1572]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: graph nodes  = 1286
+Dec 17 11:35:08 launchpad ollama[1572]: llama_new_context_with_model: graph splits = 37
+Dec 17 11:35:08 launchpad ollama[861591]: INFO [main] model loaded | tid="140081273618432" timestamp=1734464108
+Dec 17 11:35:08 launchpad ollama[1572]: time=2024-12-17T11:35:08.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.33 seconds"
+Dec 17 11:35:08 launchpad ollama[861591]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1770 n_keep=4 n_left=2044 n_shift=1022 tid="140081273618432" timestamp=1734464108
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 | 47.721660674s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.652541ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.506657ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |    3.281126ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:35:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:35:54 | 200 |     3.26924ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:37:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:37:25 | 200 |    5.516592ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:38:26 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:38:26 | 200 |    5.230736ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:38:26 launchpad ollama[1572]: time=2024-12-17T11:38:26.276-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 | 53.267199828s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.515037ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.598043ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:19 | 200 |    3.475272ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:19 launchpad ollama[1572]: time=2024-12-17T11:39:19.558-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:39:20 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:20 | 200 |  1.191025203s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:39:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:39:25 | 200 |     3.73684ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:39:25 launchpad ollama[1572]: time=2024-12-17T11:39:25.729-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 | 40.467856449s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.527293ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.659195ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:06 | 200 |    3.675467ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:06 launchpad ollama[1572]: time=2024-12-17T11:40:06.211-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |  3.138858564s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.384973ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.268885ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.559398ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:09 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:09 | 200 |    3.132726ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:29 launchpad ollama[1572]: time=2024-12-17T11:40:29.191-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |  1.366045992s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.336638ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    4.127728ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.251137ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:40:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:40:30 | 200 |    3.557758ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:08 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:08 | 200 |    3.341518ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:08 launchpad ollama[1572]: time=2024-12-17T11:41:08.635-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 | 40.868745177s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.604762ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.673155ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:49 | 200 |    3.075599ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:41:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:41:56 | 200 |    3.757316ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:06 launchpad ollama[1572]: time=2024-12-17T11:42:06.494-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |  4.067280721s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.465717ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.240047ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.442517ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:10 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:10 | 200 |    3.204584ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:42:49 | 200 |    3.576072ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:42:49 launchpad ollama[1572]: time=2024-12-17T11:42:49.492-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 | 41.241049981s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.347869ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.544009ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:30 | 200 |    3.414563ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:30 launchpad ollama[1572]: time=2024-12-17T11:43:30.749-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:43:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:34 | 200 |  3.582464741s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:43:50 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:43:50 | 200 |    3.480633ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:43:50 launchpad ollama[1572]: time=2024-12-17T11:43:50.733-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 | 41.271786737s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |    3.624681ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |     3.67793ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:32 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:32 | 200 |    3.188362ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:39 | 200 |    3.511368ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:39 launchpad ollama[1572]: time=2024-12-17T11:44:39.347-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 | 15.438769121s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.592361ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.670274ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:44:54 | 200 |    3.345963ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:44:54 launchpad ollama[1572]: time=2024-12-17T11:44:54.800-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |  5.481096374s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.271702ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.708494ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.164169ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:00 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:00 | 200 |    3.565407ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:22 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:22 | 200 |    3.165493ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:22 launchpad ollama[1572]: time=2024-12-17T11:45:22.498-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 | 16.016094349s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    3.815723ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    4.102251ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:38 | 200 |    3.555883ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:38 launchpad ollama[1572]: time=2024-12-17T11:45:38.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |  5.701122409s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.197254ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |     4.23866ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.227173ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:45:44 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:45:44 | 200 |    3.768713ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:34 launchpad ollama[1572]: time=2024-12-17T11:46:34.412-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 | 16.986524599s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.446694ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.733003ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.529409ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:46:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:46:51 | 200 |    3.270679ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:17 launchpad ollama[1572]: time=2024-12-17T11:47:17.449-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 | 18.019481659s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.252731ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.452773ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    4.092426ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:35 | 200 |    3.808774ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:35 launchpad ollama[1572]: time=2024-12-17T11:47:35.550-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 | 20.396254743s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    3.635684ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    4.415975ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:55 | 200 |    3.151721ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:55 launchpad ollama[1572]: time=2024-12-17T11:47:55.961-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |  3.667295825s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.299418ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.864437ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |    3.347656ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:47:59 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:47:59 | 200 |     3.92988ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:14 launchpad ollama[1572]: time=2024-12-17T11:48:14.428-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |  2.063212295s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |    4.151178ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |     4.39643ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |    3.542621ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:16 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:16 | 200 |     3.62247ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:36 | 200 |    3.239243ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:36 launchpad ollama[1572]: time=2024-12-17T11:48:36.715-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 | 14.353676494s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.653713ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.690094ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:51 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:51 | 200 |    3.365176ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:53 launchpad ollama[1572]: time=2024-12-17T11:48:53.271-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |  3.489522935s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.386586ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.622132ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.136189ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:48:56 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:48:56 | 200 |    3.611936ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:32 launchpad ollama[1572]: time=2024-12-17T11:49:32.523-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |  2.008228468s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.377142ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    4.045894ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.352814ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:34 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:34 | 200 |    3.276895ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:37 launchpad ollama[1572]: time=2024-12-17T11:49:37.920-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |  1.999980407s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.737004ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.274788ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.274432ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:39 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:39 | 200 |    3.476006ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:40 launchpad ollama[1572]: time=2024-12-17T11:49:40.829-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    1.9710455s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.380483ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.873673ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.723248ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:42 | 200 |    3.105496ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:43 launchpad ollama[1572]: time=2024-12-17T11:49:43.735-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |  1.994826533s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.444324ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.758586ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.219782ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:49:45 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:49:45 | 200 |    3.715591ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:06 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:06 | 200 |    3.307249ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:10 launchpad ollama[1572]: time=2024-12-17T11:50:10.716-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |  2.026880775s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.921184ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.474571ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |    3.714548ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:12 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:12 | 200 |      3.2587ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:31 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:31 | 404 |     124.129µs |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:42 | 404 |     182.392µs |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:49 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:49 | 200 |    3.630645ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:51 launchpad ollama[1572]: time=2024-12-17T11:50:51.656-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |   2.04381689s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.437339ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.903615ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.786296ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:50:53 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:50:53 | 200 |    3.416089ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:37 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:37 | 200 |    3.244939ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:39 launchpad ollama[1572]: time=2024-12-17T11:52:39.950-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |  2.653646714s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    4.802332ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    3.907449ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |     3.30092ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:52:42 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:52:42 | 200 |    3.159054ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:21 launchpad ollama[1572]: time=2024-12-17T11:53:21.476-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |  2.793217621s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.277695ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.341861ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.564028ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:24 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:24 | 200 |    3.194388ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:33 launchpad ollama[1572]: time=2024-12-17T11:53:33.939-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |  2.801734217s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.472244ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    4.063404ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.232822ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:53:36 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:53:36 | 200 |    3.692176ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:30 launchpad ollama[1572]: time=2024-12-17T11:54:30.250-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |   2.82432102s |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.682781ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.719997ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |     3.65298ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:54:33 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:54:33 | 200 |    3.451804ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |      14.915µs |       127.0.0.1 | HEAD     "/"
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |     3.19846ms |       127.0.0.1 | POST     "/api/show"
+Dec 17 11:55:01 launchpad ollama[1572]: time=2024-12-17T11:55:01.163-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:55:01 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:01 | 200 |    3.797903ms |       127.0.0.1 | POST     "/api/generate"
+Dec 17 11:55:38 launchpad ollama[1572]: time=2024-12-17T11:55:38.153-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:55:41 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:55:41 | 200 |  3.221294985s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:56:17 launchpad ollama[1572]: time=2024-12-17T11:56:17.217-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:56:25 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:56:25 | 200 |  8.634351008s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:56:34 launchpad ollama[1572]: time=2024-12-17T11:56:34.255-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:56:40 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:56:40 | 200 |  6.712967398s |       127.0.0.1 | POST     "/api/chat"
+Dec 17 11:57:16 launchpad ollama[1572]: time=2024-12-17T11:57:16.185-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 17 11:57:28 launchpad ollama[1572]: [GIN] 2024/12/17 - 11:57:28 | 200 | 12.770291401s |       127.0.0.1 | POST     "/api/chat"
+-- Boot ec1135dab11f474ca5286a2fef056c09 --
+Dec 17 18:00:58 launchpad systemd[1]: Starting Server for local large language models...
+Dec 17 18:00:58 launchpad systemd[1]: Started Server for local large language models.
+Dec 17 18:00:58 launchpad ollama[1620]: 2024/12/17 18:00:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.689-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.693-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.693-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 17 18:00:58 launchpad ollama[1620]: time=2024-12-17T18:00:58.695-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2831793936/runners
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.688-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 17 18:01:01 launchpad ollama[1620]: time=2024-12-17T18:01:01.892-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:17 | 200 |     364.657µs |       127.0.0.1 | HEAD     "/"
+Dec 18 08:26:17 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:17 | 200 |    5.992374ms |       127.0.0.1 | POST     "/api/show"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.561-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10250158080 required="9.2 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.698-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.699-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38309"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.700-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] build info | build=0 commit="unknown" tid="140601025531904" timestamp=1734539177
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140601025531904" timestamp=1734539177 total_threads=16
+Dec 18 08:26:17 launchpad ollama[7745]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38309" tid="140601025531904" timestamp=1734539177
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:26:17 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:26:17 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:26:17 launchpad ollama[1620]: time=2024-12-18T08:26:17.950-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:26:17 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:26:25 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:26:26 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:26:26 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:26:26 launchpad ollama[7745]: INFO [main] model loaded | tid="140601025531904" timestamp=1734539186
+Dec 18 08:26:26 launchpad ollama[1620]: time=2024-12-18T08:26:26.480-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 18 08:26:26 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:26 | 200 |  8.923147347s |       127.0.0.1 | POST     "/api/generate"
+Dec 18 08:26:45 launchpad ollama[1620]: time=2024-12-18T08:26:45.161-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:26:57 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:26:57 | 200 | 12.317376986s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.204-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10250158080 required="9.2 GiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.349-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 34313"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.350-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.351-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] build info | build=0 commit="unknown" tid="140706388664320" timestamp=1734539584
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140706388664320" timestamp=1734539584 total_threads=16
+Dec 18 08:33:04 launchpad ollama[7836]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34313" tid="140706388664320" timestamp=1734539584
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:33:04 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:33:04 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:33:04 launchpad ollama[1620]: time=2024-12-18T08:33:04.634-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:33:04 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:33:05 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:33:05 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:33:05 launchpad ollama[7836]: INFO [main] model loaded | tid="140706388664320" timestamp=1734539585
+Dec 18 08:33:05 launchpad ollama[1620]: time=2024-12-18T08:33:05.637-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 08:33:18 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:33:18 | 200 |  14.22103974s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.273-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.419-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9966452736 required="9.2 GiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.419-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.420-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 36331"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.421-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] build info | build=0 commit="unknown" tid="140122786766848" timestamp=1734540273
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122786766848" timestamp=1734540273 total_threads=16
+Dec 18 08:44:33 launchpad ollama[8201]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36331" tid="140122786766848" timestamp=1734540273
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:44:33 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:44:33 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:44:33 launchpad ollama[1620]: time=2024-12-18T08:44:33.709-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:44:33 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:44:34 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:44:34 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:44:34 launchpad ollama[8201]: INFO [main] model loaded | tid="140122786766848" timestamp=1734540274
+Dec 18 08:44:34 launchpad ollama[1620]: time=2024-12-18T08:44:34.713-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 08:44:47 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:44:47 | 200 | 14.344121562s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:45:12 launchpad ollama[1620]: time=2024-12-18T08:45:12.862-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:45:23 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:45:23 | 200 | 10.164273531s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.020-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.164-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9984737280 required="9.2 GiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.164-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.165-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 36597"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.166-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] build info | build=0 commit="unknown" tid="140087951147008" timestamp=1734540669
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140087951147008" timestamp=1734540669 total_threads=16
+Dec 18 08:51:09 launchpad ollama[8399]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36597" tid="140087951147008" timestamp=1734540669
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 08:51:09 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 08:51:09 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 08:51:09 launchpad ollama[1620]: time=2024-12-18T08:51:09.462-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 18 08:51:09 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 08:51:10 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 08:51:10 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 2
+Dec 18 08:51:10 launchpad ollama[8399]: INFO [main] model loaded | tid="140087951147008" timestamp=1734540670
+Dec 18 08:51:10 launchpad ollama[1620]: time=2024-12-18T08:51:10.466-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Dec 18 08:51:29 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:51:29 | 200 | 20.344841658s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:54:35 launchpad ollama[1620]: time=2024-12-18T08:54:35.242-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:54:44 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:54:44 | 200 |  8.979218815s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 08:58:06 launchpad ollama[1620]: time=2024-12-18T08:58:06.634-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 08:58:27 launchpad ollama[1620]: [GIN] 2024/12/18 - 08:58:27 | 200 | 20.973919252s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:02:54 launchpad ollama[1620]: time=2024-12-18T09:02:54.017-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:03:16 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:03:16 | 200 | 22.938191365s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:04:24 launchpad ollama[1620]: time=2024-12-18T09:04:24.837-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:04:48 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:04:48 | 200 | 23.184344856s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:09:11 launchpad ollama[1620]: time=2024-12-18T09:09:11.966-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:09:30 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:09:30 | 200 | 18.907398692s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:11:49 launchpad ollama[1620]: time=2024-12-18T09:11:49.132-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:12:06 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:12:06 | 200 | 17.640043768s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:14:31 launchpad ollama[1620]: time=2024-12-18T09:14:31.413-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:14:44 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:14:44 | 200 | 12.874644777s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 09:16:40 launchpad ollama[1620]: time=2024-12-18T09:16:40.421-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 09:16:46 launchpad ollama[1620]: [GIN] 2024/12/18 - 09:16:46 | 200 |  6.071586475s |       127.0.0.1 | POST     "/api/chat"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.105-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.251-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.252-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2831793936/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33937"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.253-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] build info | build=0 commit="unknown" tid="139748038270976" timestamp=1734556572
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748038270976" timestamp=1734556572 total_threads=16
+Dec 18 13:16:12 launchpad ollama[12110]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33937" tid="139748038270976" timestamp=1734556572
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type  f32:   81 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type q4_0:  281 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llama_model_loader: - type q6_K:    1 tensors
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_vocab: special tokens cache size = 3
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: format           = GGUF V2
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: arch             = llama
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: vocab type       = SPM
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_vocab          = 32016
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_merges         = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: vocab_only       = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd           = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_layer          = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_head           = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_head_kv        = 40
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_rot            = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_swa            = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_gqa            = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ff             = 13824
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_expert         = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_expert_used    = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: causal attn      = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: pooling type     = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope type        = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope scaling     = linear
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: freq_scale_train = 1
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_d_state      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model type       = 13B
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model ftype      = Q4_0
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model params     = 13.02 B
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: general.name     = codellama
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: BOS token        = 1 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: EOS token        = 2 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: UNK token        = 0 ''
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_print_meta: max token length = 48
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 18 13:16:12 launchpad ollama[1620]: ggml_cuda_init: found 1 CUDA devices:
+Dec 18 13:16:12 launchpad ollama[1620]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 18 13:16:12 launchpad ollama[1620]: time=2024-12-18T13:16:12.543-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: offloading 38 repeating layers to GPU
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors: offloaded 38/41 layers to GPU
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 18 13:16:12 launchpad ollama[1620]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_ctx      = 2048
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_batch    = 512
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: n_ubatch   = 512
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: flash_attn = 0
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: freq_scale = 1
+Dec 18 13:16:13 launchpad ollama[1620]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: graph nodes  = 1286
+Dec 18 13:16:13 launchpad ollama[1620]: llama_new_context_with_model: graph splits = 26
+Dec 18 13:16:13 launchpad ollama[12110]: INFO [main] model loaded | tid="139748038270976" timestamp=1734556573
+Dec 18 13:16:13 launchpad ollama[1620]: time=2024-12-18T13:16:13.547-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Dec 18 13:16:24 launchpad ollama[1620]: [GIN] 2024/12/18 - 13:16:24 | 200 | 12.052254895s |       127.0.0.1 | POST     "/api/chat"
+Dec 22 18:30:50 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 22 18:30:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 22 18:30:52 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 22 18:30:52 launchpad systemd[1]: ollama.service: Consumed 4min 24.320s CPU time, 8.1G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot e41844aa9c074591b27e1c5a25e18d3c --
+Dec 23 07:45:09 launchpad systemd[1]: Starting Server for local large language models...
+Dec 23 07:45:09 launchpad systemd[1]: Started Server for local large language models.
+Dec 23 07:45:09 launchpad ollama[1579]: 2024/12/23 07:45:09 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.464-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.468-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.469-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 23 07:45:09 launchpad ollama[1579]: time=2024-12-23T07:45:09.471-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2172411424/runners
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.466-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.467-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 23 07:45:12 launchpad ollama[1579]: time=2024-12-23T07:45:12.676-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:01 | 200 |     318.329µs |       127.0.0.1 | HEAD     "/"
+Dec 26 08:40:01 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:01 | 200 |   18.324665ms |       127.0.0.1 | POST     "/api/show"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9601089536 required="6.2 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.674-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2172411424/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39893"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.676-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] build info | build=0 commit="unknown" tid="140213899902976" timestamp=1735231201
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140213899902976" timestamp=1735231201 total_threads=16
+Dec 26 08:40:01 launchpad ollama[25550]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39893" tid="140213899902976" timestamp=1735231201
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 26 08:40:01 launchpad ollama[1579]: time=2024-12-26T08:40:01.928-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type  f32:   65 tensors
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type q4_0:  225 tensors
+Dec 26 08:40:01 launchpad ollama[1579]: llama_model_loader: - type q6_K:    1 tensors
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_vocab: special tokens cache size = 256
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: arch             = llama
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: vocab type       = BPE
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_vocab          = 128256
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_merges         = 280147
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: vocab_only       = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd           = 4096
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_layer          = 32
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_head           = 32
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_head_kv        = 8
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_rot            = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_swa            = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_gqa            = 4
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ff             = 14336
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_expert         = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_expert_used    = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: causal attn      = 1
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: pooling type     = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope type        = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope scaling     = linear
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: freq_scale_train = 1
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_d_state      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model type       = 8B
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model ftype      = Q4_0
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model params     = 8.03 B
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_print_meta: max token length = 256
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 26 08:40:02 launchpad ollama[1579]: ggml_cuda_init: found 1 CUDA devices:
+Dec 26 08:40:02 launchpad ollama[1579]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 26 08:40:02 launchpad ollama[1579]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 26 08:40:06 launchpad ollama[1579]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_ctx      = 8192
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_batch    = 512
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: n_ubatch   = 512
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: flash_attn = 0
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: freq_scale = 1
+Dec 26 08:40:07 launchpad ollama[1579]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: graph nodes  = 1030
+Dec 26 08:40:07 launchpad ollama[1579]: llama_new_context_with_model: graph splits = 2
+Dec 26 08:40:07 launchpad ollama[25550]: INFO [main] model loaded | tid="140213899902976" timestamp=1735231207
+Dec 26 08:40:07 launchpad ollama[1579]: time=2024-12-26T08:40:07.447-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Dec 26 08:40:07 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:40:07 | 200 |   5.95389214s |       127.0.0.1 | POST     "/api/generate"
+Dec 26 08:44:08 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:44:08 | 200 |  5.773063495s |       127.0.0.1 | POST     "/api/chat"
+Dec 26 08:45:16 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:45:16 | 200 |  3.236343889s |       127.0.0.1 | POST     "/api/chat"
+Dec 26 08:46:04 launchpad ollama[1579]: [GIN] 2024/12/26 - 08:46:04 | 200 |  1.811321635s |       127.0.0.1 | POST     "/api/chat"
+-- Boot 59e4e91c265e46c5b4d70cfcce7e99e2 --
+Dec 26 14:11:39 launchpad systemd[1]: Starting Server for local large language models...
+Dec 26 14:11:39 launchpad systemd[1]: Started Server for local large language models.
+Dec 26 14:11:39 launchpad ollama[1607]: 2024/12/26 14:11:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.003-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.007-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.008-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 26 14:11:40 launchpad ollama[1607]: time=2024-12-26T14:11:40.010-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2730878161/runners
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.015-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.016-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 26 14:11:43 launchpad ollama[1607]: time=2024-12-26T14:11:43.248-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.6 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 27 14:38:43 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:38:43 | 404 |     505.897µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789557248 required="6.2 GiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.875-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35155"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.876-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:38:50 launchpad ollama[1607]: time=2024-12-27T14:38:50.877-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] build info | build=0 commit="unknown" tid="140036355035136" timestamp=1735339131
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140036355035136" timestamp=1735339131 total_threads=16
+Dec 27 14:38:51 launchpad ollama[600432]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35155" tid="140036355035136" timestamp=1735339131
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:38:51 launchpad ollama[1607]: time=2024-12-27T14:38:51.128-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 256
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: vocab type       = BPE
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 280147
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 4096
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_head           = 32
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 4
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 14336
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model type       = 8B
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_print_meta: max token length = 256
+Dec 27 14:38:51 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:38:51 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.14 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 14:38:56 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1024.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1024.00 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 2.02 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     2.02 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 560.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   560.01 MiB
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 14:38:56 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:38:56 launchpad ollama[600432]: INFO [main] model loaded | tid="140036355035136" timestamp=1735339136
+Dec 27 14:38:56 launchpad ollama[1607]: time=2024-12-27T14:38:56.898-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Dec 27 14:39:52 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:39:52 | 200 |          1m1s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.317-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.125426471 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.566-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.374853636 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:44:57 launchpad ollama[1607]: time=2024-12-27T14:44:57.817-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.625394288 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:51:17 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:51:17 | 404 |      79.667µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.169-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8752005120 required="6.2 GiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.170-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.170-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44609"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.171-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] build info | build=0 commit="unknown" tid="139846059413504" timestamp=1735339896
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139846059413504" timestamp=1735339896 total_threads=16
+Dec 27 14:51:36 launchpad ollama[601091]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44609" tid="139846059413504" timestamp=1735339896
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 256
+Dec 27 14:51:36 launchpad ollama[1607]: time=2024-12-27T14:51:36.422-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: vocab type       = BPE
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 280147
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 4096
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_head           = 32
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 4
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 14336
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model type       = 8B
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_print_meta: max token length = 256
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.14 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 14:51:36 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1024.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1024.00 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 2.02 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     2.02 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 560.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   560.01 MiB
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 14:51:36 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:51:37 launchpad ollama[601091]: INFO [main] model loaded | tid="139846059413504" timestamp=1735339897
+Dec 27 14:51:37 launchpad ollama[1607]: time=2024-12-27T14:51:37.175-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |         1m55s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |   12.793602ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:53:31 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:53:31 | 200 |   12.590232ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 | 28.286425244s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 |   12.636286ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:54:42 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:54:42 | 200 |   13.119285ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:57:10 launchpad ollama[1607]: [GIN] 2024/12/27 - 14:57:10 | 404 |      67.309µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 14:58:07 launchpad ollama[1607]: time=2024-12-27T14:58:07.619-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 14:58:07 launchpad ollama[1607]: time=2024-12-27T14:58:07.759-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="5.4 GiB"
+Dec 27 14:58:12 launchpad ollama[1607]: time=2024-12-27T14:58:12.882-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.121904144 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.131-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.371374687 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.138-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.138-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=35 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2730878161/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 35 --parallel 1 --port 45005"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.140-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] build info | build=0 commit="unknown" tid="140670149808128" timestamp=1735340293
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140670149808128" timestamp=1735340293 total_threads=16
+Dec 27 14:58:13 launchpad ollama[602803]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45005" tid="140670149808128" timestamp=1735340293
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_vocab: special tokens cache size = 3
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: format           = GGUF V2
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: arch             = llama
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: vocab type       = SPM
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_merges         = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: vocab_only       = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd           = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_layer          = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_head           = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_rot            = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_swa            = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_gqa            = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ff             = 13824
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_expert         = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: causal attn      = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: pooling type     = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope type        = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope scaling     = linear
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model type       = 13B
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: general.name     = codellama
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_print_meta: max token length = 48
+Dec 27 14:58:13 launchpad ollama[1607]: ggml_cuda_init: failed to initialize CUDA: forward compatibility was attempted on non supported HW
+Dec 27 14:58:13 launchpad ollama[1607]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.382-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.621899773 model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa
+Dec 27 14:58:13 launchpad ollama[1607]: time=2024-12-27T14:58:13.390-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors: offloading 35 repeating layers to GPU
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors: offloaded 35/41 layers to GPU
+Dec 27 14:58:20 launchpad ollama[1607]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_batch    = 512
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: flash_attn = 0
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 14:58:20 launchpad ollama[1607]: llama_new_context_with_model: freq_scale = 1
+Dec 27 14:58:20 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 1600.00 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 0.14 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: ggml_cuda_host_malloc: failed to allocate 204.01 MiB of pinned memory: forward compatibility was attempted on non supported HW
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model:  CUDA_Host compute buffer size =   204.01 MiB
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 14:58:21 launchpad ollama[1607]: llama_new_context_with_model: graph splits = 1
+Dec 27 14:58:21 launchpad ollama[602803]: INFO [main] model loaded | tid="140670149808128" timestamp=1735340301
+Dec 27 14:58:21 launchpad ollama[1607]: time=2024-12-27T14:58:21.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.53 seconds"
+Dec 27 14:58:21 launchpad ollama[602803]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165165 n_keep=4 n_left=2044 n_shift=1022 tid="140670149808128" timestamp=1735340301
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |         2m47s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |    3.849651ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:00:54 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:00:54 | 200 |    3.592242ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:03:20 launchpad ollama[1607]: time=2024-12-27T15:03:20.680-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:03:20 launchpad ollama[602803]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165341 n_keep=4 n_left=2044 n_shift=1022 tid="140670149808128" timestamp=1735340600
+Dec 27 15:05:12 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:05:12 | 200 |         1m51s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:08:45 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:08:45 | 200 |    1.617502ms |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:09:44 launchpad ollama[1607]: time=2024-12-27T15:09:44.086-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:11:12 launchpad ollama[1607]: time=2024-12-27T15:11:12.315-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:12:14 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:12:14 | 200 |         2m30s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:14:33 launchpad ollama[1607]: [GIN] 2024/12/27 - 15:14:33 | 200 |         3m21s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:19:38 launchpad ollama[1607]: time=2024-12-27T15:19:38.701-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.133624605 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:19:38 launchpad ollama[1607]: time=2024-12-27T15:19:38.951-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.383435707 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:19:39 launchpad ollama[1607]: time=2024-12-27T15:19:39.201-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.633218749 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Dec 27 15:22:58 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 27 15:23:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 27 15:23:03 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 27 15:23:03 launchpad systemd[1]: ollama.service: Consumed 1h 46min 7.781s CPU time, 14G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 44ff52faa26f4443971e9c3d29efe1cd --
+Dec 27 15:23:46 launchpad systemd[1]: Starting Server for local large language models...
+Dec 27 15:23:46 launchpad systemd[1]: Started Server for local large language models.
+Dec 27 15:23:46 launchpad ollama[1503]: 2024/12/27 15:23:46 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.207-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.212-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.213-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 27 15:23:46 launchpad ollama[1503]: time=2024-12-27T15:23:46.214-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2343501151/runners
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.125-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.126-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 27 15:23:49 launchpad ollama[1503]: time=2024-12-27T15:23:49.357-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Dec 27 15:25:31 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:25:31 | 200 |    2.681026ms |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.790-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.954-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10755571712 required="9.2 GiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.954-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.6 GiB" free_swap="68.9 GiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.955-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[10.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 33093"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.956-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 15:26:14 launchpad ollama[1503]: time=2024-12-27T15:26:14.957-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] build info | build=0 commit="unknown" tid="140345678643200" timestamp=1735341975
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140345678643200" timestamp=1735341975 total_threads=16
+Dec 27 15:26:15 launchpad ollama[4157]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33093" tid="140345678643200" timestamp=1735341975
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 15:26:15 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 15:26:15 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 15:26:15 launchpad ollama[1503]: time=2024-12-27T15:26:15.207-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 15:26:15 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 15:26:22 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 15:26:23 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 15:26:23 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 15:26:23 launchpad ollama[4157]: INFO [main] model loaded | tid="140345678643200" timestamp=1735341983
+Dec 27 15:26:23 launchpad ollama[1503]: time=2024-12-27T15:26:23.733-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Dec 27 15:26:29 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:26:29 | 200 |  14.38802252s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:27:16 launchpad ollama[1503]: time=2024-12-27T15:27:16.658-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:27:31 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:27:31 | 200 | 14.992202305s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:30:08 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:08 | 404 |     133.706µs |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:30:19 launchpad ollama[1503]: time=2024-12-27T15:30:19.619-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |  6.441713687s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |    4.374709ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:30:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:30:26 | 200 |    3.253192ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:31:16 launchpad ollama[1503]: time=2024-12-27T15:31:16.387-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:31:16 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165044 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342276
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |   8.40532896s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |    3.140186ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:31:24 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:31:24 | 200 |    3.681763ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:35:49 launchpad ollama[1503]: time=2024-12-27T15:35:49.701-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:35:49 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165433 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342549
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |  9.704588023s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |     3.98204ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:35:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:35:59 | 200 |    3.572789ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:36:45 launchpad ollama[1503]: time=2024-12-27T15:36:45.921-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:36:46 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=165712 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342606
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 | 11.632665765s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 |    4.151026ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:36:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:36:57 | 200 |     3.69001ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:37:08 launchpad ollama[1503]: time=2024-12-27T15:37:08.348-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:37:08 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=166048 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342628
+Dec 27 15:37:19 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:19 | 200 | 11.600473915s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:37:20 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:20 | 200 |    3.417516ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:37:20 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:37:20 | 200 |    3.097743ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:38:07 launchpad ollama[1503]: time=2024-12-27T15:38:07.935-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:38:08 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=166391 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342688
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |  9.632512934s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |    3.812244ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:38:17 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:38:17 | 200 |    3.174486ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 15:39:18 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:39:18 | 200 |     737.788µs |       127.0.0.1 | GET      "/api/tags"
+Dec 27 15:39:47 launchpad ollama[1503]: time=2024-12-27T15:39:47.134-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:39:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:39:53 | 200 |   6.46578068s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:41:02 launchpad ollama[1503]: time=2024-12-27T15:41:02.440-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:41:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:41:09 | 200 |  6.900632764s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:42:19 launchpad ollama[1503]: time=2024-12-27T15:42:19.222-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:42:19 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1933 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735342939
+Dec 27 15:42:30 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:42:30 | 200 | 11.479806001s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:45:28 launchpad ollama[1503]: time=2024-12-27T15:45:28.683-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:45:28 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2262 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343128
+Dec 27 15:45:38 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:45:38 | 200 |  9.804324867s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:47:14 launchpad ollama[1503]: time=2024-12-27T15:47:14.008-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:47:14 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2554 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343234
+Dec 27 15:47:25 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:47:25 | 200 | 11.349651911s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:48:15 launchpad ollama[1503]: time=2024-12-27T15:48:15.040-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:48:15 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2887 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343295
+Dec 27 15:48:25 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:48:25 | 200 | 10.460803877s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:49:06 launchpad ollama[1503]: time=2024-12-27T15:49:06.009-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:49:06 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3193 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343346
+Dec 27 15:49:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:49:09 | 200 |  3.493525323s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:49:45 launchpad ollama[1503]: time=2024-12-27T15:49:45.446-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:49:45 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3282 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343385
+Dec 27 15:49:48 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:49:48 | 200 |  3.353399063s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:50:23 launchpad ollama[1503]: time=2024-12-27T15:50:23.898-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:50:23 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3359 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343423
+Dec 27 15:50:32 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:50:32 | 200 |  8.290711857s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:50:45 launchpad ollama[1503]: time=2024-12-27T15:50:45.916-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:50:45 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3598 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343445
+Dec 27 15:50:54 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:50:54 | 200 |  8.171207666s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:54:04 launchpad ollama[1503]: time=2024-12-27T15:54:04.841-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:54:04 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3833 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343644
+Dec 27 15:54:22 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:54:22 | 200 | 17.884059421s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:55:58 launchpad ollama[1503]: time=2024-12-27T15:55:58.184-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:55:58 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=4398 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343758
+Dec 27 15:56:10 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:56:10 | 200 | 12.459036639s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:56:44 launchpad ollama[1503]: time=2024-12-27T15:56:44.814-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:56:44 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=9355 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343804
+Dec 27 15:57:03 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:57:03 | 200 | 18.678989865s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:57:51 launchpad ollama[1503]: time=2024-12-27T15:57:51.645-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:57:51 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=12260 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343871
+Dec 27 15:57:59 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:57:59 | 200 |  7.465078241s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 15:58:38 launchpad ollama[1503]: time=2024-12-27T15:58:38.522-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 15:58:38 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=14838 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735343918
+Dec 27 15:58:55 launchpad ollama[1503]: [GIN] 2024/12/27 - 15:58:55 | 200 | 17.406273881s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:00:27 launchpad ollama[1503]: time=2024-12-27T16:00:27.846-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:00:27 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=17703 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735344027
+Dec 27 16:00:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:00:57 | 200 | 29.523803789s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:01:23 launchpad ollama[1503]: time=2024-12-27T16:01:23.237-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:01:23 launchpad ollama[4157]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20029 n_keep=4 n_left=2044 n_shift=1022 tid="140345678643200" timestamp=1735344083
+Dec 27 16:01:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:01:27 | 200 |  4.749219744s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.017-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.196-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401546240 required="9.2 GiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.196-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.197-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.197-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 40899"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.198-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] build info | build=0 commit="unknown" tid="139803143118848" timestamp=1735345664
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139803143118848" timestamp=1735345664 total_threads=16
+Dec 27 16:27:44 launchpad ollama[15881]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40899" tid="139803143118848" timestamp=1735345664
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:27:44 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:27:44 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:27:44 launchpad ollama[1503]: time=2024-12-27T16:27:44.475-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:27:44 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:27:45 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:27:45 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:27:45 launchpad ollama[15881]: INFO [main] model loaded | tid="139803143118848" timestamp=1735345665
+Dec 27 16:27:45 launchpad ollama[1503]: time=2024-12-27T16:27:45.478-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.28 seconds"
+Dec 27 16:27:45 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20362 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735345665
+Dec 27 16:27:56 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:27:56 | 200 | 12.863784421s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:31:37 launchpad ollama[1503]: time=2024-12-27T16:31:37.055-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:31:37 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=20942 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735345897
+Dec 27 16:31:46 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:31:46 | 200 |  9.560043699s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:34:24 launchpad ollama[1503]: time=2024-12-27T16:34:24.627-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:34:24 launchpad ollama[15881]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=21379 n_keep=4 n_left=2044 n_shift=1022 tid="139803143118848" timestamp=1735346064
+Dec 27 16:34:33 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:34:33 | 200 |  8.980140604s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:37:51 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:51 | 200 |      15.551µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:37:51 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:51 | 200 |   17.029928ms |       127.0.0.1 | POST     "/api/show"
+Dec 27 16:37:51 launchpad ollama[1503]: time=2024-12-27T16:37:51.552-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="984.6 MiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10401611776 required="6.2 GiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.193-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42711"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.194-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] build info | build=0 commit="unknown" tid="140120121364480" timestamp=1735346272
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140120121364480" timestamp=1735346272 total_threads=16
+Dec 27 16:37:52 launchpad ollama[17188]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42711" tid="140120121364480" timestamp=1735346272
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 256
+Dec 27 16:37:52 launchpad ollama[1503]: time=2024-12-27T16:37:52.445-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: vocab type       = BPE
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 280147
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 4096
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_head           = 32
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 4
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 14336
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model type       = 8B
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_print_meta: max token length = 256
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:37:52 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:37:52 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:37:52 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:37:57 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 16:37:57 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:37:57 launchpad ollama[17188]: INFO [main] model loaded | tid="140120121364480" timestamp=1735346277
+Dec 27 16:37:57 launchpad ollama[1503]: time=2024-12-27T16:37:57.961-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Dec 27 16:37:57 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:37:57 | 200 |  6.591417148s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:38:38 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:38:38 | 200 |  605.889284ms |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:39:26 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:39:26 | 200 |  8.232584114s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:40:41 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:40:41 | 200 |  6.235198023s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:41:35 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:41:35 | 200 |  2.848868536s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:42:34 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:42:34 | 200 |   5.35050044s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:44:48 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:44:48 | 200 |  4.922310641s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:45:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:45:27 | 200 |   2.44683618s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:47:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:47:39 | 200 |  3.946344219s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:48:47 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:48:47 | 200 |  4.527828229s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:50:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:09 | 200 |      16.397µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:50:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:09 | 200 |      62.763µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:50:22 launchpad ollama[1503]: time=2024-12-27T16:50:22.814-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.007-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.772-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401611776 required="9.2 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.772-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.773-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45797"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:50:23 launchpad ollama[1503]: time=2024-12-27T16:50:23.774-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] build info | build=0 commit="unknown" tid="140337552855040" timestamp=1735347023
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140337552855040" timestamp=1735347023 total_threads=16
+Dec 27 16:50:23 launchpad ollama[18824]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45797" tid="140337552855040" timestamp=1735347023
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:50:23 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:50:23 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:50:23 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:50:24 launchpad ollama[1503]: time=2024-12-27T16:50:24.095-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:50:24 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:50:25 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:50:25 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:50:25 launchpad ollama[18824]: INFO [main] model loaded | tid="140337552855040" timestamp=1735347025
+Dec 27 16:50:25 launchpad ollama[1503]: time=2024-12-27T16:50:25.350-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.58 seconds"
+Dec 27 16:50:25 launchpad ollama[18824]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=21795 n_keep=4 n_left=2044 n_shift=1022 tid="140337552855040" timestamp=1735347025
+Dec 27 16:50:28 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:28 | 200 |      17.572µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:50:28 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:28 | 200 |      16.007µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:50:33 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:50:33 | 200 | 11.019714285s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 16:50:54 launchpad ollama[1503]: time=2024-12-27T16:50:54.949-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="986.6 MiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10401611776 required="6.2 GiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.597-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.598-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39645"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.599-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] build info | build=0 commit="unknown" tid="140418646966272" timestamp=1735347055
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140418646966272" timestamp=1735347055 total_threads=16
+Dec 27 16:50:55 launchpad ollama[18958]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39645" tid="140418646966272" timestamp=1735347055
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type  f32:   65 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type q4_0:  225 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:50:55 launchpad ollama[1503]: time=2024-12-27T16:50:55.849-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 256
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V3 (latest)
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: vocab type       = BPE
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 128256
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 280147
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 4096
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_head           = 32
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 8
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 4
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 14336
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 500000.0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model type       = 8B
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model params     = 8.03 B
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: LF token         = 128 'Ä'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_print_meta: max token length = 256
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:50:55 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:50:55 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:50:55 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloading 32 repeating layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors: offloaded 33/33 layers to GPU
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 8192
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 500000.0
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:50:56 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1030
+Dec 27 16:50:56 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:50:56 launchpad ollama[18958]: INFO [main] model loaded | tid="140418646966272" timestamp=1735347056
+Dec 27 16:50:56 launchpad ollama[1503]: time=2024-12-27T16:50:56.853-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Dec 27 16:51:02 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:51:02 | 200 |    7.5564247s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:51:45 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:51:45 | 200 |  3.964105561s |       127.0.0.1 | POST     "/api/chat"
+Dec 27 16:52:49 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:52:49 | 200 |      79.213µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:52:49 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:52:49 | 200 |      52.777µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:53:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:39 | 200 |      16.752µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:53:39 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:39 | 200 |      19.085µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:53:50 launchpad ollama[1503]: time=2024-12-27T16:53:50.288-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 16:53:50 launchpad ollama[1503]: time=2024-12-27T16:53:50.470-08:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.223-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10401611776 required="9.2 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.223-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45217"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.226-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] build info | build=0 commit="unknown" tid="140631915614208" timestamp=1735347231
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140631915614208" timestamp=1735347231 total_threads=16
+Dec 27 16:53:51 launchpad ollama[19486]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45217" tid="140631915614208" timestamp=1735347231
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 16:53:51 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 16:53:51 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 16:53:51 launchpad ollama[1503]: time=2024-12-27T16:53:51.541-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 16:53:51 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 16:53:52 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 16:53:52 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 16:53:52 launchpad ollama[19486]: INFO [main] model loaded | tid="140631915614208" timestamp=1735347232
+Dec 27 16:53:52 launchpad ollama[1503]: time=2024-12-27T16:53:52.545-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Dec 27 16:53:52 launchpad ollama[19486]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22211 n_keep=4 n_left=2044 n_shift=1022 tid="140631915614208" timestamp=1735347232
+Dec 27 16:53:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:53 | 200 |      24.294µs |       127.0.0.1 | HEAD     "/"
+Dec 27 16:53:53 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:53:53 | 200 |      20.127µs |       127.0.0.1 | GET      "/api/ps"
+Dec 27 16:54:07 launchpad ollama[1503]: [GIN] 2024/12/27 - 16:54:07 | 200 | 17.579127342s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 17:17:01 launchpad ollama[1503]: time=2024-12-27T17:17:01.944-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10266869760 required="9.2 GiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.149-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.150-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2343501151/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 37637"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.151-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] build info | build=0 commit="unknown" tid="140471900819456" timestamp=1735348622
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140471900819456" timestamp=1735348622 total_threads=16
+Dec 27 17:17:02 launchpad ollama[26672]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37637" tid="140471900819456" timestamp=1735348622
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type  f32:   81 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type q4_0:  281 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llama_model_loader: - type q6_K:    1 tensors
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_vocab: special tokens cache size = 3
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: format           = GGUF V2
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: arch             = llama
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: vocab type       = SPM
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_vocab          = 32016
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_merges         = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: vocab_only       = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ctx_train      = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd           = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_layer          = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_head           = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_head_kv        = 40
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_rot            = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_swa            = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_k    = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_head_v    = 128
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_gqa            = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ff             = 13824
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_expert         = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_expert_used    = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: causal attn      = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: pooling type     = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope type        = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope scaling     = linear
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: freq_base_train  = 1000000.0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: freq_scale_train = 1
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: rope_finetuned   = unknown
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_conv       = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_inner      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_d_state      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_rank      = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model type       = 13B
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model ftype      = Q4_0
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model params     = 13.02 B
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: general.name     = codellama
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: BOS token        = 1 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: EOS token        = 2 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: UNK token        = 0 ''
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: SUF token        = 32008 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: MID token        = 32009 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: EOT token        = 32010 '▁'
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_print_meta: max token length = 48
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Dec 27 17:17:02 launchpad ollama[1503]: ggml_cuda_init: found 1 CUDA devices:
+Dec 27 17:17:02 launchpad ollama[1503]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Dec 27 17:17:02 launchpad ollama[1503]: time=2024-12-27T17:17:02.465-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloading 40 repeating layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloading non-repeating layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors: offloaded 41/41 layers to GPU
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Dec 27 17:17:02 launchpad ollama[1503]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_ctx      = 2048
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_batch    = 512
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: n_ubatch   = 512
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: flash_attn = 0
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: freq_base  = 1000000.0
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: freq_scale = 1
+Dec 27 17:17:03 launchpad ollama[1503]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: graph nodes  = 1286
+Dec 27 17:17:03 launchpad ollama[1503]: llama_new_context_with_model: graph splits = 2
+Dec 27 17:17:03 launchpad ollama[26672]: INFO [main] model loaded | tid="140471900819456" timestamp=1735348623
+Dec 27 17:17:03 launchpad ollama[1503]: time=2024-12-27T17:17:03.721-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.57 seconds"
+Dec 27 17:17:03 launchpad ollama[26672]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22719 n_keep=4 n_left=2044 n_shift=1022 tid="140471900819456" timestamp=1735348623
+Dec 27 17:17:09 launchpad ollama[1503]: [GIN] 2024/12/27 - 17:17:09 | 200 |  7.869648553s |       127.0.0.1 | POST     "/api/generate"
+Dec 27 17:19:09 launchpad ollama[1503]: time=2024-12-27T17:19:09.739-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Dec 27 17:19:09 launchpad ollama[26672]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=22918 n_keep=4 n_left=2044 n_shift=1022 tid="140471900819456" timestamp=1735348749
+Dec 27 17:19:27 launchpad ollama[1503]: [GIN] 2024/12/27 - 17:19:27 | 200 | 17.365185082s |       127.0.0.1 | POST     "/api/generate"
+Dec 30 09:35:26 launchpad systemd[1]: Stopping Server for local large language models...
+Dec 30 09:35:27 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Dec 30 09:35:27 launchpad systemd[1]: Stopped Server for local large language models.
+Dec 30 09:35:27 launchpad systemd[1]: ollama.service: Consumed 6min 45.323s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 4a5d80c7b4704b6ca2c70beb9642e6c5 --
+Dec 30 09:36:03 launchpad systemd[1]: Starting Server for local large language models...
+Dec 30 09:36:03 launchpad systemd[1]: Started Server for local large language models.
+Dec 30 09:36:03 launchpad ollama[1531]: 2024/12/30 09:36:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.337-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.343-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.344-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Dec 30 09:36:03 launchpad ollama[1531]: time=2024-12-30T09:36:03.344-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3263265929/runners
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.289-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.290-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.290-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Dec 30 09:36:06 launchpad ollama[1531]: time=2024-12-30T09:36:06.525-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 01 08:26:01 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 01 08:26:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 01 08:26:01 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 01 08:26:01 launchpad systemd[1]: ollama.service: Consumed 4.081s CPU time, 785.6M memory peak, 233.1M read from disk, 508.1M written to disk.
+-- Boot 260c13a7c30249458f47f3db2bdc3453 --
+Jan 01 08:26:35 launchpad systemd[1]: Starting Server for local large language models...
+Jan 01 08:26:35 launchpad systemd[1]: Started Server for local large language models.
+Jan 01 08:26:35 launchpad ollama[1534]: 2025/01/01 08:26:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.205-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.209-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.210-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 01 08:26:35 launchpad ollama[1534]: time=2025-01-01T08:26:35.211-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2383843368/runners
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.162-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.162-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 01 08:26:38 launchpad ollama[1534]: time=2025-01-01T08:26:38.370-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 01 17:02:39 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:39 | 200 |     561.176µs |       127.0.0.1 | HEAD     "/"
+Jan 01 17:02:39 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:39 | 200 |    6.095628ms |       127.0.0.1 | POST     "/api/show"
+Jan 01 17:02:39 launchpad ollama[1534]: time=2025-01-01T17:02:39.875-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.036-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.0 GiB" free_swap="68.9 GiB"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.036-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=0 layers.split="" memory.available="[253.1 MiB]" memory.gpu_overhead="0 B" memory.required.full="8.3 GiB" memory.required.partial="0 B" memory.required.kv="1.6 GiB" memory.required.allocations="[0 B]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cpu_avx2/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --no-mmap --parallel 1 --port 42311"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.038-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] build info | build=0 commit="unknown" tid="140151064412864" timestamp=1735779760
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140151064412864" timestamp=1735779760 total_threads=16
+Jan 01 17:02:40 launchpad ollama[80568]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42311" tid="140151064412864" timestamp=1735779760
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Jan 01 17:02:40 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 17:02:40 launchpad ollama[1534]: time=2025-01-01T17:02:40.289-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 17:02:48 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 17:02:49 launchpad ollama[1534]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model:        CPU compute buffer size =   204.01 MiB
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 17:02:49 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 1
+Jan 01 17:02:49 launchpad ollama[80568]: INFO [main] model loaded | tid="140151064412864" timestamp=1735779769
+Jan 01 17:02:49 launchpad ollama[1534]: time=2025-01-01T17:02:49.567-08:00 level=INFO source=server.go:626 msg="llama runner started in 9.53 seconds"
+Jan 01 17:02:49 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:02:49 | 200 |  9.695901175s |       127.0.0.1 | POST     "/api/generate"
+Jan 01 17:03:12 launchpad ollama[1534]: time=2025-01-01T17:03:12.114-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 17:04:15 launchpad ollama[1534]: [GIN] 2025/01/01 - 17:04:15 | 200 |          1m3s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 17:09:15 launchpad ollama[1534]: cuda driver library failed to get device context 2time=2025-01-01T17:09:15.288-08:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Jan 01 17:09:15 launchpad ollama[1534]: cuda driver library failed to get device context 2time=2025-01-01T17:09:15.656-08:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.769-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.930-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.2 GiB" free_swap="68.9 GiB"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.931-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=0 layers.split="" memory.available="[1.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="8.3 GiB" memory.required.partial="0 B" memory.required.kv="1.6 GiB" memory.required.allocations="[0 B]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cpu_avx2/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --no-mmap --parallel 1 --port 39691"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 18:58:56 launchpad ollama[1534]: time=2025-01-01T18:58:56.932-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] build info | build=0 commit="unknown" tid="139871342240448" timestamp=1735786736
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139871342240448" timestamp=1735786736 total_threads=16
+Jan 01 18:58:56 launchpad ollama[96633]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39691" tid="139871342240448" timestamp=1735786736
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.17 MiB
+Jan 01 18:58:56 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 18:58:57 launchpad ollama[1534]: time=2025-01-01T18:58:57.184-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 18:58:58 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 18:58:59 launchpad ollama[1534]: llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model:        CPU  output buffer size =     0.14 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model:        CPU compute buffer size =   204.01 MiB
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 18:58:59 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 1
+Jan 01 18:58:59 launchpad ollama[96633]: INFO [main] model loaded | tid="139871342240448" timestamp=1735786739
+Jan 01 18:58:59 launchpad ollama[1534]: time=2025-01-01T18:58:59.693-08:00 level=INFO source=server.go:626 msg="llama runner started in 2.76 seconds"
+Jan 01 19:00:17 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:00:17 | 200 |         1m20s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:00:55 launchpad ollama[1534]: time=2025-01-01T19:00:55.488-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:01:16 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:01:16 | 200 | 21.465450198s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:03:15 launchpad ollama[1534]: time=2025-01-01T19:03:15.006-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:03:44 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:03:44 | 200 |  29.17834599s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.111-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.268-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.268-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=2 layers.split="" memory.available="[1.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="1.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[1.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2383843368/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 2 --parallel 1 --port 41601"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.269-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] build info | build=0 commit="unknown" tid="140659831738368" timestamp=1735787440
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140659831738368" timestamp=1735787440 total_threads=16
+Jan 01 19:10:40 launchpad ollama[100100]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41601" tid="140659831738368" timestamp=1735787440
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type  f32:   81 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type q4_0:  281 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llama_model_loader: - type q6_K:    1 tensors
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_vocab: special tokens cache size = 3
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: format           = GGUF V2
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: arch             = llama
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: vocab type       = SPM
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_vocab          = 32016
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_merges         = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: vocab_only       = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd           = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_layer          = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_head           = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_head_kv        = 40
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_rot            = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_swa            = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_gqa            = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ff             = 13824
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_expert         = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_expert_used    = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: causal attn      = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: pooling type     = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope type        = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope scaling     = linear
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: freq_scale_train = 1
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_d_state      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model type       = 13B
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model ftype      = Q4_0
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model params     = 13.02 B
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: general.name     = codellama
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: BOS token        = 1 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: EOS token        = 2 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: UNK token        = 0 ''
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_print_meta: max token length = 48
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 01 19:10:40 launchpad ollama[1534]: ggml_cuda_init: found 1 CUDA devices:
+Jan 01 19:10:40 launchpad ollama[1534]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 01 19:10:40 launchpad ollama[1534]: time=2025-01-01T19:10:40.520-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: offloading 2 repeating layers to GPU
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors: offloaded 2/41 layers to GPU
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llm_load_tensors:      CUDA0 buffer size =   340.39 MiB
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_ctx      = 2048
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_batch    = 512
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: n_ubatch   = 512
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: flash_attn = 0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 01 19:10:40 launchpad ollama[1534]: llama_new_context_with_model: freq_scale = 1
+Jan 01 19:10:41 launchpad ollama[1534]: llama_kv_cache_init:  CUDA_Host KV buffer size =  1520.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_kv_cache_init:      CUDA0 KV buffer size =    80.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: graph nodes  = 1286
+Jan 01 19:10:41 launchpad ollama[1534]: llama_new_context_with_model: graph splits = 422
+Jan 01 19:10:41 launchpad ollama[100100]: INFO [main] model loaded | tid="140659831738368" timestamp=1735787441
+Jan 01 19:10:41 launchpad ollama[1534]: time=2025-01-01T19:10:41.533-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Jan 01 19:11:25 launchpad ollama[1534]: [GIN] 2025/01/01 - 19:11:25 | 200 | 45.575448992s |       127.0.0.1 | POST     "/api/chat"
+Jan 01 19:16:30 launchpad ollama[1534]: time=2025-01-01T19:16:30.837-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.146357639 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 01 19:16:31 launchpad ollama[1534]: time=2025-01-01T19:16:31.086-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.395657879 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 01 19:16:31 launchpad ollama[1534]: time=2025-01-01T19:16:31.337-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.646032647 model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c
+Jan 03 18:31:59 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 03 18:31:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 03 18:31:59 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 03 18:31:59 launchpad systemd[1]: ollama.service: Consumed 31min 9.751s CPU time, 16.1G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot 68bc23759a4a4fc28942066d48b2dcba --
+Jan 03 18:32:35 launchpad systemd[1]: Starting Server for local large language models...
+Jan 03 18:32:35 launchpad systemd[1]: Started Server for local large language models.
+Jan 03 18:32:35 launchpad ollama[1531]: 2025/01/03 18:32:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.336-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.340-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.342-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 03 18:32:35 launchpad ollama[1531]: time=2025-01-03T18:32:35.343-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2922762215/runners
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.275-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.275-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 03 18:32:38 launchpad ollama[1531]: time=2025-01-03T18:32:38.480-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:43 | 200 |     781.325µs |       127.0.0.1 | HEAD     "/"
+Jan 06 15:38:43 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:43 | 200 |   17.251415ms |       127.0.0.1 | POST     "/api/show"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8739618816 required="6.2 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.715-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2922762215/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40151"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.717-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] build info | build=0 commit="unknown" tid="139790765953024" timestamp=1736206723
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790765953024" timestamp=1736206723 total_threads=16
+Jan 06 15:38:43 launchpad ollama[401857]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40151" tid="139790765953024" timestamp=1736206723
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type  f32:   65 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type q4_0:  225 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: llama_model_loader: - type q6_K:    1 tensors
+Jan 06 15:38:43 launchpad ollama[1531]: time=2025-01-06T15:38:43.968-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_vocab: special tokens cache size = 256
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: arch             = llama
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: vocab type       = BPE
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_vocab          = 128256
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_merges         = 280147
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: vocab_only       = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd           = 4096
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_layer          = 32
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_head           = 32
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_head_kv        = 8
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_rot            = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_swa            = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_gqa            = 4
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ff             = 14336
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_expert         = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_expert_used    = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: causal attn      = 1
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: pooling type     = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope type        = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope scaling     = linear
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: freq_scale_train = 1
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_d_state      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model type       = 8B
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model ftype      = Q4_0
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model params     = 8.03 B
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_print_meta: max token length = 256
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 06 15:38:44 launchpad ollama[1531]: ggml_cuda_init: found 1 CUDA devices:
+Jan 06 15:38:44 launchpad ollama[1531]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 06 15:38:44 launchpad ollama[1531]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 06 15:38:48 launchpad ollama[1531]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_ctx      = 8192
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_batch    = 512
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: n_ubatch   = 512
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: flash_attn = 0
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: freq_scale = 1
+Jan 06 15:38:49 launchpad ollama[1531]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: graph nodes  = 1030
+Jan 06 15:38:49 launchpad ollama[1531]: llama_new_context_with_model: graph splits = 2
+Jan 06 15:38:49 launchpad ollama[401857]: INFO [main] model loaded | tid="139790765953024" timestamp=1736206729
+Jan 06 15:38:49 launchpad ollama[1531]: time=2025-01-06T15:38:49.486-08:00 level=INFO source=server.go:626 msg="llama runner started in 5.77 seconds"
+Jan 06 15:38:49 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:38:49 | 200 |  5.951679064s |       127.0.0.1 | POST     "/api/generate"
+Jan 06 15:43:00 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:43:00 | 200 |  7.356289318s |       127.0.0.1 | POST     "/api/chat"
+Jan 06 15:45:54 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:45:54 | 200 |  8.194895954s |       127.0.0.1 | POST     "/api/chat"
+Jan 06 15:50:07 launchpad ollama[1531]: [GIN] 2025/01/06 - 15:50:07 | 200 |  6.660291288s |       127.0.0.1 | POST     "/api/chat"
+-- Boot f57f0e33b6b440d58dc36136ed7e598b --
+Jan 06 19:22:05 launchpad systemd[1]: Starting Server for local large language models...
+Jan 06 19:22:05 launchpad systemd[1]: Started Server for local large language models.
+Jan 06 19:22:05 launchpad ollama[1622]: 2025/01/06 19:22:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.372-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.376-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.378-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 06 19:22:05 launchpad ollama[1622]: time=2025-01-06T19:22:05.380-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama448366513/runners
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.273-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:08 launchpad ollama[1622]: time=2025-01-06T19:22:08.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:22:10 launchpad ollama[1622]: time=2025-01-06T19:22:10.028-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 06 19:23:52 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: State 'stop-sigterm' timed out. Killing.
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: Killing process 1622 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:25:22 launchpad systemd[1]: ollama.service: Killing process 1666 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:26:52 launchpad systemd[1]: ollama.service: Processes still around after SIGKILL. Ignoring.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: State 'final-sigterm' timed out. Killing.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: Killing process 1622 (.ollama-wrapped) with signal SIGKILL.
+Jan 06 19:28:23 launchpad systemd[1]: ollama.service: Killing process 1666 (.ollama-wrapped) with signal SIGKILL.
+-- Boot 57b5dff306f2462f905c3306555265b4 --
+Jan 06 19:30:26 launchpad systemd[1]: Starting Server for local large language models...
+Jan 06 19:30:26 launchpad systemd[1]: Started Server for local large language models.
+Jan 06 19:30:26 launchpad ollama[1534]: 2025/01/06 19:30:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.355-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.360-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.360-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 06 19:30:26 launchpad ollama[1534]: time=2025-01-06T19:30:26.362-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2024859506/runners
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.265-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 06 19:30:29 launchpad ollama[1534]: time=2025-01-06T19:30:29.498-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 06 19:37:04 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 06 19:37:04 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 06 19:37:04 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 06 19:37:04 launchpad systemd[1]: ollama.service: Consumed 3.346s CPU time, 786.2M memory peak, 233.4M read from disk, 508.1M written to disk.
+-- Boot 9d5500554b66400fb07cf0b1edd61af5 --
+Jan 07 07:34:16 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 07:34:16 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 07:34:16 launchpad ollama[1540]: 2025/01/07 07:34:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.277-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.284-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.285-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 07:34:16 launchpad ollama[1540]: time=2025-01-07T07:34:16.286-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama758438586/runners
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.365-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.366-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 07:34:19 launchpad ollama[1540]: time=2025-01-07T07:34:19.554-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 07 12:53:59 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 12:53:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 12:53:59 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 12:53:59 launchpad systemd[1]: ollama.service: Consumed 3.700s CPU time, 787.8M memory peak, 233M read from disk, 508.1M written to disk.
+-- Boot f682186d89a94a0bba220de6233eee6e --
+Jan 07 12:54:31 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 12:54:31 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 12:54:31 launchpad ollama[1532]: 2025/01/07 12:54:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.428-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.434-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.435-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 12:54:31 launchpad ollama[1532]: time=2025-01-07T12:54:31.435-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama446181134/runners
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.396-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.397-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.397-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 12:54:34 launchpad ollama[1532]: time=2025-01-07T12:54:34.622-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 07 13:18:52 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 13:18:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 13:18:52 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 13:18:52 launchpad systemd[1]: ollama.service: Consumed 3.414s CPU time, 786M memory peak, 233M read from disk, 508.1M written to disk.
+-- Boot ec29c3ea9d1f4afb95b8205796087b74 --
+Jan 07 13:19:24 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 13:19:24 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 13:19:24 launchpad ollama[1526]: 2025/01/07 13:19:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.895-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.901-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.902-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 13:19:24 launchpad ollama[1526]: time=2025-01-07T13:19:24.905-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2599412483/runners
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.799-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:27 launchpad ollama[1526]: time=2025-01-07T13:19:27.800-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 13:19:28 launchpad ollama[1526]: time=2025-01-07T13:19:28.015-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Jan 07 18:52:45 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 18:52:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 18:52:45 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 18:52:45 launchpad systemd[1]: ollama.service: Consumed 3.632s CPU time, 786M memory peak, 233.2M read from disk, 508.1M written to disk.
+-- Boot 855489849095400cab96d7df50fa4664 --
+Jan 07 18:53:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 18:53:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 18:53:40 launchpad ollama[1595]: 2025/01/07 18:53:40 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.034-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.042-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.043-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 18:53:40 launchpad ollama[1595]: time=2025-01-07T18:53:40.045-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2772112966/runners
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.063-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:43 launchpad ollama[1595]: time=2025-01-07T18:53:43.064-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:53:44 launchpad ollama[1595]: time=2025-01-07T18:53:44.819-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 07 18:54:19 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 07 18:54:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 07 18:54:20 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 07 18:54:20 launchpad systemd[1]: ollama.service: Consumed 5.283s CPU time, 789M memory peak, 235.9M read from disk, 508.1M written to disk.
+-- Boot 294cf7b93f014947854eb454411dbc1d --
+Jan 07 18:54:51 launchpad systemd[1]: Starting Server for local large language models...
+Jan 07 18:54:51 launchpad systemd[1]: Started Server for local large language models.
+Jan 07 18:54:51 launchpad ollama[1618]: 2025/01/07 18:54:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.180-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.185-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.186-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 07 18:54:51 launchpad ollama[1618]: time=2025-01-07T18:54:51.187-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3742342109/runners
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.093-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.094-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.094-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.095-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.095-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 07 18:54:54 launchpad ollama[1618]: time=2025-01-07T18:54:54.330-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 08 08:55:07 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 08 08:55:07 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 08 08:55:07 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 08 08:55:07 launchpad systemd[1]: ollama.service: Consumed 3.399s CPU time, 785.5M memory peak, 233.1M read from disk, 508.1M written to disk.
+-- Boot 386ee235859d401d9b09ce3536af41c0 --
+Jan 08 08:55:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 08 08:55:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 08 08:55:39 launchpad ollama[1534]: 2025/01/08 08:55:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.817-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.822-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.822-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 08 08:55:39 launchpad ollama[1534]: time=2025-01-08T08:55:39.824-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2571630/runners
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.791-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.792-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.792-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 08 08:55:42 launchpad ollama[1534]: time=2025-01-08T08:55:42.999-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Jan 09 14:19:09 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 09 14:19:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 09 14:19:09 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 09 14:19:09 launchpad systemd[1]: ollama.service: Consumed 3.964s CPU time, 786M memory peak, 233.2M read from disk, 508.1M written to disk.
+-- Boot d7b8ec2daadb429b9bb273c7a542d90a --
+Jan 09 14:19:14 launchpad systemd[1]: Starting Server for local large language models...
+Jan 09 14:19:14 launchpad systemd[1]: Started Server for local large language models.
+Jan 09 14:19:14 launchpad ollama[1530]: 2025/01/09 14:19:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.621-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.627-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.628-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 09 14:19:14 launchpad ollama[1530]: time=2025-01-09T14:19:14.628-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama782984971/runners
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.541-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.542-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 09 14:19:17 launchpad ollama[1530]: time=2025-01-09T14:19:17.753-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 12 09:46:00 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 09:46:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 09:46:00 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 09:46:00 launchpad systemd[1]: ollama.service: Consumed 4.351s CPU time, 786.2M memory peak, 233.3M read from disk, 508.1M written to disk.
+Jan 12 09:46:15 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 09:46:15 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 09:46:15 launchpad ollama[430897]: 2025/01/12 09:46:15 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.353-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.357-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.357-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 09:46:15 launchpad ollama[430897]: time=2025-01-12T09:46:15.358-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3398906615/runners
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 09:46:18 launchpad ollama[430897]: time=2025-01-12T09:46:18.540-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.7 GiB"
+Jan 12 12:17:22 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:17:22 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:17:22 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:17:22 launchpad systemd[1]: ollama.service: Consumed 3.492s CPU time, 560.1M memory peak, 7.5M read from disk, 508.1M written to disk.
+-- Boot e311e8cc46c445b8bdb99d06997efb8d --
+Jan 12 12:18:05 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:18:05 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:18:05 launchpad ollama[1516]: 2025/01/12 12:18:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.518-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.523-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.525-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:18:05 launchpad ollama[1516]: time=2025-01-12T12:18:05.526-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4005739148/runners
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.355-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.356-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:08 launchpad ollama[1516]: time=2025-01-12T12:18:08.356-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:18:10 launchpad ollama[1516]: time=2025-01-12T12:18:10.078-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 12 12:18:25 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:18:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:18:26 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:18:26 launchpad systemd[1]: ollama.service: Consumed 5.107s CPU time, 786.4M memory peak, 233.4M read from disk, 508.1M written to disk.
+-- Boot 92062c6e1bfb461c92e69af8c3bc6804 --
+Jan 12 12:19:04 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:19:04 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:19:04 launchpad ollama[1519]: 2025/01/12 12:19:04 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.322-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.328-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.329-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:19:04 launchpad ollama[1519]: time=2025-01-12T12:19:04.331-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2431135854/runners
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.157-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.158-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:07 launchpad ollama[1519]: time=2025-01-12T12:19:07.158-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:19:08 launchpad ollama[1519]: time=2025-01-12T12:19:08.873-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Jan 12 12:19:53 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 12:19:53 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 12:19:53 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 12:19:53 launchpad systemd[1]: ollama.service: Consumed 5.135s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c1318a295cf74419a0ed2c3b8704e973 --
+Jan 12 12:20:48 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 12:20:48 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 12:20:48 launchpad ollama[1519]: 2025/01/12 12:20:48 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.278-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.285-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.287-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 12:20:48 launchpad ollama[1519]: time=2025-01-12T12:20:48.287-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4073200553/runners
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.380-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.380-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.381-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 12:20:51 launchpad ollama[1519]: time=2025-01-12T12:20:51.599-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 12 13:23:41 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 12 13:23:42 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 12 13:23:42 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 12 13:23:42 launchpad systemd[1]: ollama.service: Consumed 3.574s CPU time, 787.3M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot a2f47ad6412e4d348d67fd5c54faed9c --
+Jan 12 13:24:16 launchpad systemd[1]: Starting Server for local large language models...
+Jan 12 13:24:16 launchpad systemd[1]: Started Server for local large language models.
+Jan 12 13:24:16 launchpad ollama[1524]: 2025/01/12 13:24:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.651-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.658-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.660-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 12 13:24:16 launchpad ollama[1524]: time=2025-01-12T13:24:16.661-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama384067037/runners
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.755-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.755-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.756-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 12 13:24:19 launchpad ollama[1524]: time=2025-01-12T13:24:19.983-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:44 | 200 |     820.573µs |       127.0.0.1 | HEAD     "/"
+Jan 19 11:07:44 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:44 | 200 |   16.168757ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.743-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10062856192 required="6.2 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.743-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.744-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.745-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45051"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.746-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] build info | build=0 commit="unknown" tid="140213763547136" timestamp=1737313664
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140213763547136" timestamp=1737313664 total_threads=16
+Jan 19 11:07:44 launchpad ollama[165551]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45051" tid="140213763547136" timestamp=1737313664
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:07:44 launchpad ollama[1524]: time=2025-01-19T11:07:44.997-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:07:45 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:07:45 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:07:45 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:07:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:07:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:07:50 launchpad ollama[165551]: INFO [main] model loaded | tid="140213763547136" timestamp=1737313670
+Jan 19 11:07:50 launchpad ollama[1524]: time=2025-01-19T11:07:50.765-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Jan 19 11:07:50 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:07:50 | 200 |  6.190084904s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10127933440 required="6.2 GiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.583-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34491"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.584-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] build info | build=0 commit="unknown" tid="140517491892224" timestamp=1737314336
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140517491892224" timestamp=1737314336 total_threads=16
+Jan 19 11:18:56 launchpad ollama[167260]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34491" tid="140517491892224" timestamp=1737314336
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:18:56 launchpad ollama[1524]: time=2025-01-19T11:18:56.836-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:18:56 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:18:56 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:18:56 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:18:57 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:18:57 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:18:57 launchpad ollama[167260]: INFO [main] model loaded | tid="140517491892224" timestamp=1737314337
+Jan 19 11:18:57 launchpad ollama[1524]: time=2025-01-19T11:18:57.589-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Jan 19 11:19:04 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:19:04 | 200 |  8.000557174s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:23:41 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:23:41 | 200 |  4.999027767s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10127933440 required="6.2 GiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.067-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43393"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.069-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] build info | build=0 commit="unknown" tid="140039068495872" timestamp=1737314990
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039068495872" timestamp=1737314990 total_threads=16
+Jan 19 11:29:50 launchpad ollama[168779]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43393" tid="140039068495872" timestamp=1737314990
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 11:29:50 launchpad ollama[1524]: time=2025-01-19T11:29:50.320-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:29:50 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:29:50 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:29:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 11:29:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:29:51 launchpad ollama[168779]: INFO [main] model loaded | tid="140039068495872" timestamp=1737314991
+Jan 19 11:29:51 launchpad ollama[1524]: time=2025-01-19T11:29:51.073-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 19 11:30:00 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:30:00 | 200 | 10.605879085s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:34:43 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:34:43 | 200 |  7.165432862s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:39:18 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:39:18 | 200 |  4.485852787s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:49:12 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:12 | 200 |      15.965µs |       127.0.0.1 | HEAD     "/"
+Jan 19 11:49:12 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:12 | 200 |    7.696369ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.172-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9993977856 required="9.2 GiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.325-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 45113"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.327-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] build info | build=0 commit="unknown" tid="140310543175680" timestamp=1737316152
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140310543175680" timestamp=1737316152 total_threads=16
+Jan 19 11:49:12 launchpad ollama[171728]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45113" tid="140310543175680" timestamp=1737316152
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type  f32:   81 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type q4_0:  281 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 3
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V2
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: vocab type       = SPM
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 32016
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_head           = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 40
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 13824
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 1000000.0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model type       = 13B
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model params     = 13.02 B
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: general.name     = codellama
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 1 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 2 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: UNK token        = 0 ''
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: SUF token        = 32008 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: MID token        = 32009 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 32010 '▁'
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_print_meta: max token length = 48
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 11:49:12 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 11:49:12 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 11:49:12 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Jan 19 11:49:12 launchpad ollama[1524]: time=2025-01-19T11:49:12.578-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloading 40 repeating layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors: offloaded 41/41 layers to GPU
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Jan 19 11:49:19 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 2048
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 1000000.0
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 11:49:20 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1286
+Jan 19 11:49:20 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 11:49:20 launchpad ollama[171728]: INFO [main] model loaded | tid="140310543175680" timestamp=1737316160
+Jan 19 11:49:20 launchpad ollama[1524]: time=2025-01-19T11:49:20.851-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Jan 19 11:49:20 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:49:20 | 200 |  8.682301556s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 11:50:10 launchpad ollama[1524]: time=2025-01-19T11:50:10.653-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:50:23 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:50:23 | 200 | 13.275953621s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 11:52:24 launchpad ollama[1524]: time=2025-01-19T11:52:24.845-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Jan 19 11:52:27 launchpad ollama[1524]: [GIN] 2025/01/19 - 11:52:27 | 200 |  2.336261304s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:16:49 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:49 | 200 |       15.87µs |       127.0.0.1 | HEAD     "/"
+Jan 19 12:16:49 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:49 | 200 |   12.606595ms |       127.0.0.1 | POST     "/api/show"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9968549888 required="6.2 GiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.750-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38815"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 19 12:16:49 launchpad ollama[1524]: time=2025-01-19T12:16:49.752-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] build info | build=0 commit="unknown" tid="140526793773056" timestamp=1737317809
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140526793773056" timestamp=1737317809 total_threads=16
+Jan 19 12:16:49 launchpad ollama[175994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38815" tid="140526793773056" timestamp=1737317809
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 19 12:16:49 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 19 12:16:50 launchpad ollama[1524]: time=2025-01-19T12:16:50.003-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 19 12:16:50 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 19 12:16:50 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 19 12:16:50 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 19 12:16:50 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 19 12:16:50 launchpad ollama[175994]: INFO [main] model loaded | tid="140526793773056" timestamp=1737317810
+Jan 19 12:16:50 launchpad ollama[1524]: time=2025-01-19T12:16:50.755-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 19 12:16:50 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:16:50 | 200 |   1.17554757s |       127.0.0.1 | POST     "/api/generate"
+Jan 19 12:19:01 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:19:01 | 200 |  313.152684ms |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:20:16 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:20:16 | 200 |  6.505974129s |       127.0.0.1 | POST     "/api/chat"
+Jan 19 12:24:28 launchpad ollama[1524]: [GIN] 2025/01/19 - 12:24:28 | 200 |  538.875114ms |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:32:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:50 | 200 |      17.302µs |       127.0.0.1 | HEAD     "/"
+Jan 20 16:32:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:50 | 200 |   12.480656ms |       127.0.0.1 | POST     "/api/show"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9720692736 required="6.2 GiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.044-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34745"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.046-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] build info | build=0 commit="unknown" tid="140111006040064" timestamp=1737419571
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140111006040064" timestamp=1737419571 total_threads=16
+Jan 20 16:32:51 launchpad ollama[300296]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34745" tid="140111006040064" timestamp=1737419571
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 20 16:32:51 launchpad ollama[1524]: time=2025-01-20T16:32:51.297-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 20 16:32:51 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 20 16:32:51 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 20 16:32:51 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 20 16:32:51 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 20 16:32:52 launchpad ollama[300296]: INFO [main] model loaded | tid="140111006040064" timestamp=1737419572
+Jan 20 16:32:52 launchpad ollama[1524]: time=2025-01-20T16:32:52.051-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 20 16:32:52 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:32:52 | 200 |   1.17586167s |       127.0.0.1 | POST     "/api/generate"
+Jan 20 16:35:58 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:35:58 | 200 |  6.241917128s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:40:34 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:40:34 | 200 |  4.398668688s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:41:03 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:41:03 | 200 |  2.152900794s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:42:57 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:42:57 | 200 |  2.608396964s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:44:47 launchpad ollama[300296]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1388 n_keep=24 n_left=2024 n_shift=1012 tid="140111006040064" timestamp=1737420287
+Jan 20 16:44:54 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:44:54 | 200 |  6.853014312s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:46:43 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:46:43 | 200 |  2.143516567s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:50:00 launchpad ollama[300296]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1012 n_keep=24 n_left=2024 n_shift=1012 tid="140111006040064" timestamp=1737420600
+Jan 20 16:50:02 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:50:02 | 200 |  2.454058788s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:52:31 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:52:31 | 200 |  3.326791285s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:54:15 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:54:15 | 200 |  4.370832569s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:55:38 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:55:38 | 200 |  1.904041014s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:56:10 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:56:10 | 200 |  1.902319388s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:56:42 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:56:42 | 200 |   2.27399481s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 16:58:50 launchpad ollama[1524]: [GIN] 2025/01/20 - 16:58:50 | 200 |  3.696276704s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:00:53 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:00:53 | 200 |  5.035283596s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:02:18 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:02:18 | 200 |  5.681649967s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:03:59 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:03:59 | 200 |  3.657473484s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:05:45 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:05:45 | 200 |   5.40724379s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:08:35 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:08:35 | 200 |  5.686849969s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:09:56 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:09:56 | 200 |    5.3097907s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:11:34 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:11:34 | 200 |  5.785954266s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:13:56 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:13:56 | 200 |  5.040395188s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:15:20 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:15:20 | 200 |  4.661986849s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:17:27 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:17:27 | 200 |  6.319957332s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:18:20 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:18:20 | 200 |  5.275236009s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:22:08 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:22:08 | 200 |  4.188055837s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:25:16 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:25:16 | 200 |  9.162616578s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:27:24 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:27:24 | 200 |  7.075603632s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:30:07 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:30:07 | 200 |  6.661916832s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:33:17 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:33:17 | 200 |  6.645453646s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:35:35 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:35:35 | 200 |  5.772994211s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:38:54 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:38:54 | 200 |  6.114248872s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:40:18 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:40:18 | 200 |  4.767655427s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 17:44:51 launchpad ollama[1524]: [GIN] 2025/01/20 - 17:44:51 | 200 |  7.625060365s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.031-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9912516608 required="6.2 GiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.031-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.032-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33273"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.033-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] build info | build=0 commit="unknown" tid="140283534491648" timestamp=1737425988
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140283534491648" timestamp=1737425988 total_threads=16
+Jan 20 18:19:48 launchpad ollama[316038]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33273" tid="140283534491648" timestamp=1737425988
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 20 18:19:48 launchpad ollama[1524]: time=2025-01-20T18:19:48.284-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 20 18:19:48 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 20 18:19:48 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 20 18:19:48 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 20 18:19:48 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 20 18:19:49 launchpad ollama[316038]: INFO [main] model loaded | tid="140283534491648" timestamp=1737425989
+Jan 20 18:19:49 launchpad ollama[1524]: time=2025-01-20T18:19:49.037-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 20 18:19:55 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:19:55 | 200 |  8.056079776s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:21:09 launchpad ollama[316038]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="140283534491648" timestamp=1737426069
+Jan 20 18:21:13 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:21:13 | 200 |  4.941487318s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:21:58 launchpad ollama[316038]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="140283534491648" timestamp=1737426118
+Jan 20 18:22:02 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:22:02 | 200 |  4.487143889s |       127.0.0.1 | POST     "/api/chat"
+Jan 20 18:23:07 launchpad ollama[1524]: [GIN] 2025/01/20 - 18:23:07 | 200 |  7.047716353s |       127.0.0.1 | POST     "/api/chat"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9858056192 required="6.2 GiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.004-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama384067037/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46803"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.005-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] build info | build=0 commit="unknown" tid="140409680281600" timestamp=1737571319
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140409680281600" timestamp=1737571319 total_threads=16
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46803" tid="140409680281600" timestamp=1737571319
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type  f32:   65 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type q4_0:  225 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llama_model_loader: - type q6_K:    1 tensors
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_vocab: special tokens cache size = 256
+Jan 22 10:41:59 launchpad ollama[1524]: time=2025-01-22T10:41:59.256-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: format           = GGUF V3 (latest)
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: arch             = llama
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: vocab type       = BPE
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_vocab          = 128256
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_merges         = 280147
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: vocab_only       = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ctx_train      = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd           = 4096
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_layer          = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_head           = 32
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_head_kv        = 8
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_rot            = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_swa            = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_k    = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_head_v    = 128
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_gqa            = 4
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ff             = 14336
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_expert         = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_expert_used    = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: causal attn      = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: pooling type     = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope type        = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope scaling     = linear
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: freq_base_train  = 500000.0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: freq_scale_train = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: rope_finetuned   = unknown
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_conv       = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_inner      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_d_state      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_rank      = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model type       = 8B
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model ftype      = Q4_0
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model params     = 8.03 B
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: LF token         = 128 'Ä'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_print_meta: max token length = 256
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Jan 22 10:41:59 launchpad ollama[1524]: ggml_cuda_init: found 1 CUDA devices:
+Jan 22 10:41:59 launchpad ollama[1524]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloading 32 repeating layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloading non-repeating layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors: offloaded 33/33 layers to GPU
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_ctx      = 8192
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_batch    = 512
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: n_ubatch   = 512
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: flash_attn = 0
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: freq_base  = 500000.0
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: freq_scale = 1
+Jan 22 10:41:59 launchpad ollama[1524]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: graph nodes  = 1030
+Jan 22 10:41:59 launchpad ollama[1524]: llama_new_context_with_model: graph splits = 2
+Jan 22 10:41:59 launchpad ollama[330552]: INFO [main] model loaded | tid="140409680281600" timestamp=1737571319
+Jan 22 10:42:00 launchpad ollama[1524]: time=2025-01-22T10:42:00.009-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Jan 22 10:42:05 launchpad ollama[1524]: [GIN] 2025/01/22 - 10:42:05 | 200 |  6.501525919s |       127.0.0.1 | POST     "/api/chat"
+Jan 28 08:29:33 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 28 08:29:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 28 08:29:33 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 28 08:29:34 launchpad systemd[1]: ollama.service: Consumed 4min 7.156s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk, 9.8M incoming IP traffic, 9.5M outgoing IP traffic.
+-- Boot db9ee19690e0445f98acd4e27cfa66df --
+Jan 28 08:30:06 launchpad systemd[1]: Starting Server for local large language models...
+Jan 28 08:30:06 launchpad systemd[1]: Started Server for local large language models.
+Jan 28 08:30:06 launchpad ollama[1526]: 2025/01/28 08:30:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.439-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.444-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.446-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 28 08:30:06 launchpad ollama[1526]: time=2025-01-28T08:30:06.447-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3066321188/runners
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.390-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.391-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 28 08:30:09 launchpad ollama[1526]: time=2025-01-28T08:30:09.630-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Jan 29 09:51:03 launchpad systemd[1]: Stopping Server for local large language models...
+Jan 29 09:51:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Jan 29 09:51:03 launchpad systemd[1]: Stopped Server for local large language models.
+Jan 29 09:51:03 launchpad systemd[1]: ollama.service: Consumed 3.768s CPU time, 786.7M memory peak, 233.5M read from disk, 508.1M written to disk.
+-- Boot 452893ba9f734cdb848d42607b4ad7b8 --
+Jan 29 09:51:39 launchpad systemd[1]: Starting Server for local large language models...
+Jan 29 09:51:39 launchpad systemd[1]: Started Server for local large language models.
+Jan 29 09:51:39 launchpad ollama[1542]: 2025/01/29 09:51:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.538-08:00 level=INFO source=images.go:753 msg="total blobs: 18"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.546-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.548-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Jan 29 09:51:39 launchpad ollama[1542]: time=2025-01-29T09:51:39.550-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1578890084/runners
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.729-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.729-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.730-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Jan 29 09:51:42 launchpad ollama[1542]: time=2025-01-29T09:51:42.936-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:20:48 | 200 |     782.496µs |       127.0.0.1 | HEAD     "/"
+Feb 01 09:20:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:20:48 | 200 |    6.904482ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.070-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.241-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10052370432 required="9.2 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.241-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.242-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.243-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 34129"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.244-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] build info | build=0 commit="unknown" tid="139747224600576" timestamp=1738430448
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139747224600576" timestamp=1738430448 total_threads=16
+Feb 01 09:20:48 launchpad ollama[323448]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34129" tid="139747224600576" timestamp=1738430448
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type  f32:   81 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type q4_0:  281 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 3
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V2
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: vocab type       = SPM
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 32016
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_head           = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 40
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 13824
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model type       = 13B
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model params     = 13.02 B
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: general.name     = codellama
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 1 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 2 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: UNK token        = 0 ''
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_print_meta: max token length = 48
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 09:20:48 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 09:20:48 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 09:20:48 launchpad ollama[1542]: time=2025-02-01T09:20:48.495-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 09:20:48 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloading 40 repeating layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors: offloaded 41/41 layers to GPU
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Feb 01 09:21:19 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 09:21:20 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1286
+Feb 01 09:21:20 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 09:21:20 launchpad ollama[323448]: INFO [main] model loaded | tid="139747224600576" timestamp=1738430480
+Feb 01 09:21:20 launchpad ollama[1542]: time=2025-02-01T09:21:20.835-08:00 level=INFO source=server.go:626 msg="llama runner started in 32.59 seconds"
+Feb 01 09:21:20 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:21:20 | 200 | 32.768836167s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 09:26:20 launchpad ollama[1542]: time=2025-02-01T09:26:20.183-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 01 09:26:31 launchpad ollama[1542]: [GIN] 2025/02/01 - 09:26:31 | 200 | 11.029840079s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:39:18 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:18 | 200 |      21.515µs |       127.0.0.1 | HEAD     "/"
+Feb 01 10:39:18 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:18 | 200 |   18.915561ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.250-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9942007808 required="6.2 GiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.251-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.251-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34429"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.252-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.253-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] build info | build=0 commit="unknown" tid="139697006374912" timestamp=1738435158
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139697006374912" timestamp=1738435158 total_threads=16
+Feb 01 10:39:18 launchpad ollama[336212]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34429" tid="139697006374912" timestamp=1738435158
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type  f32:   65 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type q4_0:  225 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 256
+Feb 01 10:39:18 launchpad ollama[1542]: time=2025-02-01T10:39:18.504-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 128256
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 280147
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 4096
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_head           = 32
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 8
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 4
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 14336
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model type       = 8B
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model params     = 8.03 B
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 10:39:18 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 10:39:18 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 10:39:18 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 01 10:39:23 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 8192
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 10:39:24 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1030
+Feb 01 10:39:24 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 10:39:24 launchpad ollama[336212]: INFO [main] model loaded | tid="139697006374912" timestamp=1738435164
+Feb 01 10:39:24 launchpad ollama[1542]: time=2025-02-01T10:39:24.271-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Feb 01 10:39:24 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:39:24 | 200 |  6.194738756s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 10:40:25 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:40:25 | 200 |  7.787552389s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:42:36 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:42:36 | 200 |  9.552961921s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:47:09 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:47:09 | 200 |  8.634577406s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:49:14 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:49:14 | 200 |  4.965052598s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:52:17 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:52:17 | 200 |  7.209837514s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 10:54:43 launchpad ollama[1542]: [GIN] 2025/02/01 - 10:54:43 | 200 |  8.020455569s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.991-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9975562240 required="6.2 GiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.991-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.992-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38471"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 11:10:52 launchpad ollama[1542]: time=2025-02-01T11:10:52.993-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] build info | build=0 commit="unknown" tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139931340832768" timestamp=1738437053 total_threads=16
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38471" tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type  f32:   65 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type q4_0:  225 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 256
+Feb 01 11:10:53 launchpad ollama[1542]: time=2025-02-01T11:10:53.244-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: arch             = llama
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 128256
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 280147
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 4096
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_head           = 32
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 8
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 4
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 14336
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = linear
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model type       = 8B
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model params     = 8.03 B
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 11:10:53 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 11:10:53 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 8192
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 1
+Feb 01 11:10:53 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1030
+Feb 01 11:10:53 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 2
+Feb 01 11:10:53 launchpad ollama[340855]: INFO [main] model loaded | tid="139931340832768" timestamp=1738437053
+Feb 01 11:10:53 launchpad ollama[1542]: time=2025-02-01T11:10:53.997-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 01 11:11:04 launchpad ollama[1542]: [GIN] 2025/02/01 - 11:11:04 | 200 |  11.78116143s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:38:29 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:38:29 | 200 |      15.521µs |       127.0.0.1 | HEAD     "/"
+Feb 01 15:38:30 launchpad ollama[1542]: time=2025-02-01T15:38:30.932-08:00 level=INFO source=download.go:175 msg="downloading 5ff0abeeac1d in 16 556 MB part(s)"
+Feb 01 15:44:43 launchpad ollama[1542]: time=2025-02-01T15:44:43.463-08:00 level=INFO source=download.go:175 msg="downloading 22091531faf0 in 1 705 B part(s)"
+Feb 01 15:44:44 launchpad ollama[1542]: time=2025-02-01T15:44:44.912-08:00 level=INFO source=download.go:175 msg="downloading 4bb71764481f in 1 13 KB part(s)"
+Feb 01 15:44:46 launchpad ollama[1542]: time=2025-02-01T15:44:46.178-08:00 level=INFO source=download.go:175 msg="downloading 1c8f573e830c in 1 1.1 KB part(s)"
+Feb 01 15:44:47 launchpad ollama[1542]: time=2025-02-01T15:44:47.491-08:00 level=INFO source=download.go:175 msg="downloading 19f2fb9e8bc6 in 1 32 B part(s)"
+Feb 01 15:44:48 launchpad ollama[1542]: time=2025-02-01T15:44:48.790-08:00 level=INFO source=download.go:175 msg="downloading 34488e453cfe in 1 568 B part(s)"
+Feb 01 15:44:55 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:44:55 | 200 |         6m25s |       127.0.0.1 | POST     "/api/pull"
+Feb 01 15:45:08 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:08 | 200 |      15.863µs |       127.0.0.1 | HEAD     "/"
+Feb 01 15:45:08 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:08 | 200 |    8.245202ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.038-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.038-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=28 layers.offload=26 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.6 GiB" memory.required.partial="9.1 GiB" memory.required.kv="540.0 MiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.5 GiB" memory.weights.repeating="8.4 GiB" memory.weights.nonrepeating="164.1 MiB" memory.graph.full="212.0 MiB" memory.graph.partial="376.1 MiB"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 26 --parallel 1 --port 33197"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.039-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.040-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] build info | build=0 commit="unknown" tid="140044897447936" timestamp=1738453509
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140044897447936" timestamp=1738453509 total_threads=16
+Feb 01 15:45:09 launchpad ollama[382206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33197" tid="140044897447936" timestamp=1738453509
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: loaded meta data with 38 key-value pairs and 377 tensors from /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 (version GGUF V3 (latest))
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   2:                      deepseek2.block_count u32              = 27
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   3:                   deepseek2.context_length u32              = 163840
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   4:                 deepseek2.embedding_length u32              = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   5:              deepseek2.feed_forward_length u32              = 10944
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   6:             deepseek2.attention.head_count u32              = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   7:          deepseek2.attention.head_count_kv u32              = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   8:                   deepseek2.rope.freq_base f32              = 10000.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv   9: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  10:                deepseek2.expert_used_count u32              = 6
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  12:        deepseek2.leading_dense_block_count u32              = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  13:                       deepseek2.vocab_size u32              = 102400
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  14:           deepseek2.attention.kv_lora_rank u32              = 512
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  15:             deepseek2.attention.key_length u32              = 192
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  16:           deepseek2.attention.value_length u32              = 128
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  17:       deepseek2.expert_feed_forward_length u32              = 1408
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  18:                     deepseek2.expert_count u32              = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  19:              deepseek2.expert_shared_count u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  20:             deepseek2.expert_weights_scale f32              = 1.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  21:             deepseek2.rope.dimension_count u32              = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  22:                deepseek2.rope.scaling.type str              = yarn
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  23:              deepseek2.rope.scaling.factor f32              = 40.000000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  24: deepseek2.rope.scaling.original_context_length u32              = 4096
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  25: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.070700
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  26:                       tokenizer.ggml.model str              = gpt2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  27:                         tokenizer.ggml.pre str              = deepseek-llm
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  28:                      tokenizer.ggml.tokens arr[str,102400]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  29:                  tokenizer.ggml.token_type arr[i32,102400]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  30:                      tokenizer.ggml.merges arr[str,99757]   = ["Ġ Ġ", "Ġ t", "Ġ a", "i n", "h e...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 100000
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  32:                tokenizer.ggml.eos_token_id u32              = 100001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  33:            tokenizer.ggml.padding_token_id u32              = 100001
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  34:               tokenizer.ggml.add_bos_token bool             = true
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  35:               tokenizer.ggml.add_eos_token bool             = false
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  36:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - kv  37:               general.quantization_version u32              = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type  f32:  108 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type q4_0:  268 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 2400
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.6661 MB
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: arch             = deepseek2
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 102400
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 99757
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 163840
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 27
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_head           = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 16
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 192
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 3072
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 2048
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 10944
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 64
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 6
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = yarn
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 10000.0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 0.025
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 4096
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model type       = 16B
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model params     = 15.71 B
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: model size       = 8.29 GiB (4.53 BPW)
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: general.name     = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 100000 '<|begin▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: PAD token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: LF token         = 126 'Ä'
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_layer_dense_lead   = 1
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_lora_q             = 0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_lora_kv            = 512
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_ff_exp             = 1408
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: n_expert_shared      = 2
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: expert_weights_scale = 1.0
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_print_meta: rope_yarn_log_mul    = 0.0707
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 15:45:09 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 15:45:09 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.32 MiB
+Feb 01 15:45:09 launchpad ollama[1542]: time=2025-02-01T15:45:09.485-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: offloading 26 repeating layers to GPU
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors: offloaded 26/28 layers to GPU
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =  2222.30 MiB
+Feb 01 15:45:09 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  8168.73 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 10000.0
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 0.025
+Feb 01 15:45:10 launchpad ollama[1542]: llama_kv_cache_init:  CUDA_Host KV buffer size =    20.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =   520.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: KV self size  =  540.00 MiB, K (f16):  324.00 MiB, V (f16):  216.00 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.40 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   376.06 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1924
+Feb 01 15:45:10 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 16
+Feb 01 15:45:10 launchpad ollama[382206]: INFO [main] model loaded | tid="140044897447936" timestamp=1738453510
+Feb 01 15:45:10 launchpad ollama[1542]: time=2025-02-01T15:45:10.740-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.70 seconds"
+Feb 01 15:45:10 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:10 | 200 |  1.868792545s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 15:45:42 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:45:43 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:45:43 | 200 |  1.492885919s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:47:11 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:47:15 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:47:15 | 200 |  4.354633179s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:49:58 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:50:04 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:50:04 | 200 |  6.751007515s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:51:31 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:51:42 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:51:42 | 200 | 11.108623634s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:54:26 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:54:38 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:54:38 | 200 | 12.249143915s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 15:58:10 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 15:58:18 launchpad ollama[1542]: /build/source/llm/llama.cpp/src/llama.cpp:16940: Deepseek2 does not support K-shift
+Feb 01 15:58:21 launchpad ollama[1542]: [GIN] 2025/02/01 - 15:58:21 | 200 | 11.903695652s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:03:26 launchpad ollama[1542]: time=2025-02-01T16:03:26.639-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.138885157 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:03:26 launchpad ollama[1542]: time=2025-02-01T16:03:26.890-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.389402549 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:03:27 launchpad ollama[1542]: time=2025-02-01T16:03:27.140-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.639748416 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:32:33 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:33 | 200 |      15.859µs |       127.0.0.1 | HEAD     "/"
+Feb 01 16:32:33 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:33 | 200 |    8.054865ms |       127.0.0.1 | POST     "/api/show"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.893-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.893-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=28 layers.offload=26 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.6 GiB" memory.required.partial="9.1 GiB" memory.required.kv="540.0 MiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.5 GiB" memory.weights.repeating="8.4 GiB" memory.weights.nonrepeating="164.1 MiB" memory.graph.full="212.0 MiB" memory.graph.partial="376.1 MiB"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1578890084/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 26 --parallel 1 --port 44777"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 01 16:32:33 launchpad ollama[1542]: time=2025-02-01T16:32:33.894-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] build info | build=0 commit="unknown" tid="140476942221312" timestamp=1738456353
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476942221312" timestamp=1738456353 total_threads=16
+Feb 01 16:32:33 launchpad ollama[420139]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44777" tid="140476942221312" timestamp=1738456353
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: loaded meta data with 38 key-value pairs and 377 tensors from /var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046 (version GGUF V3 (latest))
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   0:                       general.architecture str              = deepseek2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   1:                               general.name str              = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   2:                      deepseek2.block_count u32              = 27
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   3:                   deepseek2.context_length u32              = 163840
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   4:                 deepseek2.embedding_length u32              = 2048
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   5:              deepseek2.feed_forward_length u32              = 10944
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   6:             deepseek2.attention.head_count u32              = 16
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   7:          deepseek2.attention.head_count_kv u32              = 16
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   8:                   deepseek2.rope.freq_base f32              = 10000.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv   9: deepseek2.attention.layer_norm_rms_epsilon f32              = 0.000001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  10:                deepseek2.expert_used_count u32              = 6
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  12:        deepseek2.leading_dense_block_count u32              = 1
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  13:                       deepseek2.vocab_size u32              = 102400
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  14:           deepseek2.attention.kv_lora_rank u32              = 512
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  15:             deepseek2.attention.key_length u32              = 192
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  16:           deepseek2.attention.value_length u32              = 128
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  17:       deepseek2.expert_feed_forward_length u32              = 1408
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  18:                     deepseek2.expert_count u32              = 64
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  19:              deepseek2.expert_shared_count u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  20:             deepseek2.expert_weights_scale f32              = 1.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  21:             deepseek2.rope.dimension_count u32              = 64
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  22:                deepseek2.rope.scaling.type str              = yarn
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  23:              deepseek2.rope.scaling.factor f32              = 40.000000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  24: deepseek2.rope.scaling.original_context_length u32              = 4096
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  25: deepseek2.rope.scaling.yarn_log_multiplier f32              = 0.070700
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  26:                       tokenizer.ggml.model str              = gpt2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  27:                         tokenizer.ggml.pre str              = deepseek-llm
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  28:                      tokenizer.ggml.tokens arr[str,102400]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  29:                  tokenizer.ggml.token_type arr[i32,102400]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  30:                      tokenizer.ggml.merges arr[str,99757]   = ["Ġ Ġ", "Ġ t", "Ġ a", "i n", "h e...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  31:                tokenizer.ggml.bos_token_id u32              = 100000
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  32:                tokenizer.ggml.eos_token_id u32              = 100001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  33:            tokenizer.ggml.padding_token_id u32              = 100001
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  34:               tokenizer.ggml.add_bos_token bool             = true
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  35:               tokenizer.ggml.add_eos_token bool             = false
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  36:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - kv  37:               general.quantization_version u32              = 2
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type  f32:  108 tensors
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type q4_0:  268 tensors
+Feb 01 16:32:33 launchpad ollama[1542]: llama_model_loader: - type q6_K:    1 tensors
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_vocab: special tokens cache size = 2400
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_vocab: token to piece cache size = 0.6661 MB
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: arch             = deepseek2
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: vocab type       = BPE
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_vocab          = 102400
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_merges         = 99757
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: vocab_only       = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ctx_train      = 163840
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd           = 2048
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_layer          = 27
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_head           = 16
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_head_kv        = 16
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_rot            = 64
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_swa            = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_k    = 192
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_gqa            = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_k_gqa     = 3072
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_embd_v_gqa     = 2048
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ff             = 10944
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert         = 64
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert_used    = 6
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: causal attn      = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: pooling type     = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope type        = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope scaling     = yarn
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: freq_base_train  = 10000.0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: freq_scale_train = 0.025
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ctx_orig_yarn  = 4096
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_d_state      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model type       = 16B
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model ftype      = Q4_0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model params     = 15.71 B
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: model size       = 8.29 GiB (4.53 BPW)
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: general.name     = DeepSeek-Coder-V2-Lite-Instruct
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: BOS token        = 100000 '<|begin▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: EOS token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: PAD token        = 100001 '<|end▁of▁sentence|>'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: LF token         = 126 'Ä'
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: max token length = 256
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_layer_dense_lead   = 1
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_lora_q             = 0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_lora_kv            = 512
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_ff_exp             = 1408
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: n_expert_shared      = 2
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: expert_weights_scale = 1.0
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_print_meta: rope_yarn_log_mul    = 0.0707
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 01 16:32:34 launchpad ollama[1542]: ggml_cuda_init: found 1 CUDA devices:
+Feb 01 16:32:34 launchpad ollama[1542]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: ggml ctx size =    0.32 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: time=2025-02-01T16:32:34.346-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server not responding"
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: offloading 26 repeating layers to GPU
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors: offloaded 26/28 layers to GPU
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors:        CPU buffer size =  2222.30 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: llm_load_tensors:      CUDA0 buffer size =  8168.73 MiB
+Feb 01 16:32:34 launchpad ollama[1542]: time=2025-02-01T16:32:34.598-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_ctx      = 2048
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_batch    = 512
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: n_ubatch   = 512
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: flash_attn = 0
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: freq_base  = 10000.0
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: freq_scale = 0.025
+Feb 01 16:32:35 launchpad ollama[1542]: llama_kv_cache_init:  CUDA_Host KV buffer size =    20.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_kv_cache_init:      CUDA0 KV buffer size =   520.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: KV self size  =  540.00 MiB, K (f16):  324.00 MiB, V (f16):  216.00 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.40 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:      CUDA0 compute buffer size =   376.06 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: graph nodes  = 1924
+Feb 01 16:32:35 launchpad ollama[1542]: llama_new_context_with_model: graph splits = 16
+Feb 01 16:32:35 launchpad ollama[420139]: INFO [main] model loaded | tid="140476942221312" timestamp=1738456355
+Feb 01 16:32:35 launchpad ollama[1542]: time=2025-02-01T16:32:35.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.71 seconds"
+Feb 01 16:32:35 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:32:35 | 200 |  1.876938441s |       127.0.0.1 | POST     "/api/generate"
+Feb 01 16:32:50 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:33:00 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:33:00 | 200 | 10.085431345s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:37:37 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:37:48 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:37:48 | 200 | 11.100581045s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:38:42 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:38:54 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:38:54 | 200 | 11.827158041s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:41:14 launchpad ollama[1542]: check_double_bos_eos: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
+Feb 01 16:41:20 launchpad ollama[1542]: /build/source/llm/llama.cpp/src/llama.cpp:16940: Deepseek2 does not support K-shift
+Feb 01 16:41:23 launchpad ollama[1542]: [GIN] 2025/02/01 - 16:41:23 | 200 |  9.357358628s |       127.0.0.1 | POST     "/api/chat"
+Feb 01 16:46:28 launchpad ollama[1542]: time=2025-02-01T16:46:28.886-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.13673044 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:46:29 launchpad ollama[1542]: time=2025-02-01T16:46:29.135-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.385882612 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 01 16:46:29 launchpad ollama[1542]: time=2025-02-01T16:46:29.385-08:00 level=WARN source=sched.go:646 msg="gpu VRAM usage didn't recover within timeout" seconds=5.63581397 model=/var/lib/ollama/models/blobs/sha256-5ff0abeeac1d2dbdd5455c0b49ba3b29a9ce3c1fb181b2eef2e948689d55d046
+Feb 04 17:44:07 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:07 | 200 |      17.932µs |       127.0.0.1 | HEAD     "/"
+Feb 04 17:44:07 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:07 | 200 |      50.919µs |       127.0.0.1 | GET      "/api/ps"
+Feb 04 17:44:19 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:19 | 200 |      15.157µs |       127.0.0.1 | HEAD     "/"
+Feb 04 17:44:19 launchpad ollama[1542]: [GIN] 2025/02/04 - 17:44:19 | 200 |    2.326055ms |       127.0.0.1 | GET      "/api/tags"
+Feb 07 12:14:12 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 07 12:14:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 07 12:14:12 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 07 12:14:12 launchpad systemd[1]: ollama.service: Consumed 10min 5.028s CPU time, 21G memory peak, 11.5G read from disk, 8.7G written to disk, 8.4G incoming IP traffic, 153.8M outgoing IP traffic.
+-- Boot 318701f7efd84065b4710ac959eb8901 --
+Feb 07 12:14:47 launchpad systemd[1]: Starting Server for local large language models...
+Feb 07 12:14:47 launchpad systemd[1]: Started Server for local large language models.
+Feb 07 12:14:47 launchpad ollama[1560]: 2025/02/07 12:14:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.526-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.532-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.533-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 07 12:14:47 launchpad ollama[1560]: time=2025-02-07T12:14:47.535-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2560886655/runners
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.442-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.443-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 07 12:14:50 launchpad ollama[1560]: time=2025-02-07T12:14:50.657-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 12 15:30:56 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:30:56 | 200 |     809.745µs |       127.0.0.1 | HEAD     "/"
+Feb 12 15:30:56 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:30:56 | 200 |   16.704582ms |       127.0.0.1 | POST     "/api/show"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8824487936 required="6.2 GiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.119-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44055"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.121-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] build info | build=0 commit="unknown" tid="140249299701760" timestamp=1739403057
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140249299701760" timestamp=1739403057 total_threads=16
+Feb 12 15:30:57 launchpad ollama[317666]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44055" tid="140249299701760" timestamp=1739403057
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:30:57 launchpad ollama[1560]: time=2025-02-12T15:30:57.372-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:30:57 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:30:57 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:30:57 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:31:02 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:31:03 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:31:03 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:31:03 launchpad ollama[317666]: INFO [main] model loaded | tid="140249299701760" timestamp=1739403063
+Feb 12 15:31:03 launchpad ollama[1560]: time=2025-02-12T15:31:03.388-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Feb 12 15:31:03 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:31:03 | 200 |   6.45295186s |       127.0.0.1 | POST     "/api/generate"
+Feb 12 15:31:10 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:31:10 | 200 |  806.499181ms |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:34:37 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:34:37 | 200 |  7.558818056s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:38:54 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:38:54 | 200 |   4.47735231s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8826585088 required="6.2 GiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.543-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43755"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.544-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] build info | build=0 commit="unknown" tid="140564619010048" timestamp=1739403960
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140564619010048" timestamp=1739403960 total_threads=16
+Feb 12 15:46:00 launchpad ollama[319731]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43755" tid="140564619010048" timestamp=1739403960
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:46:00 launchpad ollama[1560]: time=2025-02-12T15:46:00.795-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:46:00 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:46:00 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:46:00 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:46:01 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:46:01 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:46:01 launchpad ollama[319731]: INFO [main] model loaded | tid="140564619010048" timestamp=1739403961
+Feb 12 15:46:01 launchpad ollama[1560]: time=2025-02-12T15:46:01.548-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 15:46:11 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:46:11 | 200 | 11.600758256s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8836022272 required="6.2 GiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.078-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46595"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.079-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] build info | build=0 commit="unknown" tid="140409782231040" timestamp=1739404619
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140409782231040" timestamp=1739404619 total_threads=16
+Feb 12 15:56:59 launchpad ollama[321250]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46595" tid="140409782231040" timestamp=1739404619
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 15:56:59 launchpad ollama[1560]: time=2025-02-12T15:56:59.330-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 15:56:59 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 15:56:59 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 15:56:59 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 15:57:00 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 15:57:00 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 15:57:00 launchpad ollama[321250]: INFO [main] model loaded | tid="140409782231040" timestamp=1739404620
+Feb 12 15:57:00 launchpad ollama[1560]: time=2025-02-12T15:57:00.083-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 15:57:13 launchpad ollama[1560]: [GIN] 2025/02/12 - 15:57:13 | 200 | 14.578612736s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8836022272 required="6.2 GiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.627-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42613"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.628-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.629-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] build info | build=0 commit="unknown" tid="139939873157120" timestamp=1739405236
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139939873157120" timestamp=1739405236 total_threads=16
+Feb 12 16:07:16 launchpad ollama[322706]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42613" tid="139939873157120" timestamp=1739405236
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:07:16 launchpad ollama[1560]: time=2025-02-12T16:07:16.879-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:07:16 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:07:16 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:07:16 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:07:17 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:07:17 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:07:17 launchpad ollama[322706]: INFO [main] model loaded | tid="139939873157120" timestamp=1739405237
+Feb 12 16:07:17 launchpad ollama[1560]: time=2025-02-12T16:07:17.632-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:07:27 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:07:27 | 200 | 10.713501567s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789884928 required="6.2 GiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.812-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33111"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.813-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:16:54 launchpad ollama[1560]: time=2025-02-12T16:16:54.814-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] build info | build=0 commit="unknown" tid="140372847271936" timestamp=1739405814
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140372847271936" timestamp=1739405814 total_threads=16
+Feb 12 16:16:54 launchpad ollama[324056]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33111" tid="140372847271936" timestamp=1739405814
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:16:54 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:16:55 launchpad ollama[1560]: time=2025-02-12T16:16:55.065-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:16:55 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:16:55 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:16:55 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:16:55 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:16:55 launchpad ollama[324056]: INFO [main] model loaded | tid="140372847271936" timestamp=1739405815
+Feb 12 16:16:55 launchpad ollama[1560]: time=2025-02-12T16:16:55.817-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:17:05 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:17:05 | 200 |  10.89453299s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:19:35 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:19:35 | 200 | 10.435771224s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.045-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8789884928 required="6.2 GiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.045-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.046-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44181"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.047-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] build info | build=0 commit="unknown" tid="139989269700608" timestamp=1739406644
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139989269700608" timestamp=1739406644 total_threads=16
+Feb 12 16:30:44 launchpad ollama[325948]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44181" tid="139989269700608" timestamp=1739406644
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:30:44 launchpad ollama[1560]: time=2025-02-12T16:30:44.298-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:30:44 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:30:44 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:30:44 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:30:44 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:30:45 launchpad ollama[325948]: INFO [main] model loaded | tid="139989269700608" timestamp=1739406645
+Feb 12 16:30:45 launchpad ollama[1560]: time=2025-02-12T16:30:45.051-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:30:53 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:30:53 | 200 |  9.370615161s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:34:50 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:34:50 | 200 | 10.811598588s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.246-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8661434368 required="6.2 GiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.246-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.247-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45783"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] build info | build=0 commit="unknown" tid="139917100216320" timestamp=1739407987
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139917100216320" timestamp=1739407987 total_threads=16
+Feb 12 16:53:07 launchpad ollama[329337]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45783" tid="139917100216320" timestamp=1739407987
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 16:53:07 launchpad ollama[1560]: time=2025-02-12T16:53:07.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 16:53:07 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 16:53:07 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 16:53:07 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 16:53:08 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 16:53:08 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 16:53:08 launchpad ollama[329337]: INFO [main] model loaded | tid="139917100216320" timestamp=1739407988
+Feb 12 16:53:08 launchpad ollama[1560]: time=2025-02-12T16:53:08.251-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 16:53:15 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:53:15 | 200 |  8.481043146s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 16:56:44 launchpad ollama[1560]: [GIN] 2025/02/12 - 16:56:44 | 200 |  7.666530946s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8619229184 required="6.2 GiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.154-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.155-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40083"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.156-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] build info | build=0 commit="unknown" tid="140119408218112" timestamp=1739408899
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140119408218112" timestamp=1739408899 total_threads=16
+Feb 12 17:08:19 launchpad ollama[331594]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40083" tid="140119408218112" timestamp=1739408899
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 17:08:19 launchpad ollama[1560]: time=2025-02-12T17:08:19.407-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 17:08:19 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 17:08:19 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 17:08:19 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 17:08:20 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 17:08:20 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 17:08:20 launchpad ollama[331594]: INFO [main] model loaded | tid="140119408218112" timestamp=1739408900
+Feb 12 17:08:20 launchpad ollama[1560]: time=2025-02-12T17:08:20.160-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 17:08:27 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:08:27 | 200 |  8.778505461s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:11:34 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:11:34 | 200 |  8.016645904s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.166-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8814526464 required="6.2 GiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.166-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.167-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.167-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama2560886655/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43207"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.168-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] build info | build=0 commit="unknown" tid="140570797301760" timestamp=1739409519
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140570797301760" timestamp=1739409519 total_threads=16
+Feb 12 17:18:39 launchpad ollama[333230]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43207" tid="140570797301760" timestamp=1739409519
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type  f32:   65 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type q4_0:  225 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llama_model_loader: - type q6_K:    1 tensors
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_vocab: special tokens cache size = 256
+Feb 12 17:18:39 launchpad ollama[1560]: time=2025-02-12T17:18:39.419-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: arch             = llama
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: vocab type       = BPE
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_vocab          = 128256
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_merges         = 280147
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: vocab_only       = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd           = 4096
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_layer          = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_head           = 32
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_head_kv        = 8
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_rot            = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_swa            = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_gqa            = 4
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ff             = 14336
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_expert         = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_expert_used    = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: causal attn      = 1
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: pooling type     = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope type        = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope scaling     = linear
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: freq_scale_train = 1
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_d_state      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model type       = 8B
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model ftype      = Q4_0
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model params     = 8.03 B
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_print_meta: max token length = 256
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 12 17:18:39 launchpad ollama[1560]: ggml_cuda_init: found 1 CUDA devices:
+Feb 12 17:18:39 launchpad ollama[1560]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 12 17:18:39 launchpad ollama[1560]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_ctx      = 8192
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_batch    = 512
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: n_ubatch   = 512
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: flash_attn = 0
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: freq_scale = 1
+Feb 12 17:18:40 launchpad ollama[1560]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: graph nodes  = 1030
+Feb 12 17:18:40 launchpad ollama[1560]: llama_new_context_with_model: graph splits = 2
+Feb 12 17:18:40 launchpad ollama[333230]: INFO [main] model loaded | tid="140570797301760" timestamp=1739409520
+Feb 12 17:18:40 launchpad ollama[1560]: time=2025-02-12T17:18:40.172-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 12 17:18:45 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:18:45 | 200 |  6.309499227s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:20:51 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:20:51 | 200 |  2.663260947s |       127.0.0.1 | POST     "/api/chat"
+Feb 12 17:23:00 launchpad ollama[1560]: [GIN] 2025/02/12 - 17:23:00 | 200 |  4.274803171s |       127.0.0.1 | POST     "/api/chat"
+-- Boot 1bc40fd30e944d92a892538849018274 --
+Feb 13 08:19:27 launchpad systemd[1]: Starting Server for local large language models...
+Feb 13 08:19:27 launchpad systemd[1]: Started Server for local large language models.
+Feb 13 08:19:27 launchpad ollama[1566]: 2025/02/13 08:19:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.325-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.331-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.333-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 13 08:19:27 launchpad ollama[1566]: time=2025-02-13T08:19:27.333-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama245345955/runners
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.275-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.276-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 13 08:19:30 launchpad ollama[1566]: time=2025-02-13T08:19:30.489-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.7 GiB" available="11.1 GiB"
+Feb 15 11:52:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:37 | 200 |     760.424µs |       127.0.0.1 | HEAD     "/"
+Feb 15 11:52:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:37 | 200 |    7.228879ms |       127.0.0.1 | POST     "/api/show"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.529-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.666-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.667-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35099"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.668-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] build info | build=0 commit="unknown" tid="140665951301632" timestamp=1739649157
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140665951301632" timestamp=1739649157 total_threads=16
+Feb 15 11:52:37 launchpad ollama[174071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35099" tid="140665951301632" timestamp=1739649157
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 11:52:37 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 11:52:37 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 11:52:37 launchpad ollama[1566]: time=2025-02-15T11:52:37.919-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 11:52:37 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 11:52:45 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 11:52:46 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 11:52:46 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 11:52:46 launchpad ollama[174071]: INFO [main] model loaded | tid="140665951301632" timestamp=1739649166
+Feb 15 11:52:46 launchpad ollama[1566]: time=2025-02-15T11:52:46.442-08:00 level=INFO source=server.go:626 msg="llama runner started in 8.77 seconds"
+Feb 15 11:52:46 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:46 | 200 |  8.916506088s |       127.0.0.1 | POST     "/api/generate"
+Feb 15 11:52:54 launchpad ollama[1566]: time=2025-02-15T11:52:54.584-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:52:56 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:52:56 | 200 |  1.995319181s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 11:53:41 launchpad ollama[1566]: time=2025-02-15T11:53:41.925-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:53:52 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:53:52 | 200 | 10.953237822s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 11:58:46 launchpad ollama[1566]: time=2025-02-15T11:58:46.009-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 11:59:16 launchpad ollama[1566]: [GIN] 2025/02/15 - 11:59:16 | 200 | 30.673746077s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:01:31 launchpad ollama[1566]: time=2025-02-15T12:01:31.324-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:01:45 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:01:45 | 200 | 14.095009496s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.284-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.441-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.442-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35217"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.443-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] build info | build=0 commit="unknown" tid="139763539496960" timestamp=1739650289
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139763539496960" timestamp=1739650289 total_threads=16
+Feb 15 12:11:29 launchpad ollama[195769]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35217" tid="139763539496960" timestamp=1739650289
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 12:11:29 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 12:11:29 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 12:11:29 launchpad ollama[1566]: time=2025-02-15T12:11:29.733-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 12:11:29 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 12:11:30 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 12:11:30 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 12:11:30 launchpad ollama[195769]: INFO [main] model loaded | tid="139763539496960" timestamp=1739650290
+Feb 15 12:11:30 launchpad ollama[1566]: time=2025-02-15T12:11:30.736-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Feb 15 12:11:37 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:11:37 | 200 |  7.908151505s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:16:08 launchpad ollama[1566]: time=2025-02-15T12:16:08.876-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:16:30 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:16:30 | 200 | 21.938492665s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.070-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.228-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.228-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama245345955/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46093"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.229-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] build info | build=0 commit="unknown" tid="140125316128768" timestamp=1739653125
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140125316128768" timestamp=1739653125 total_threads=16
+Feb 15 12:58:45 launchpad ollama[210632]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46093" tid="140125316128768" timestamp=1739653125
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type  f32:   81 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type q4_0:  281 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llama_model_loader: - type q6_K:    1 tensors
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_vocab: special tokens cache size = 3
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: format           = GGUF V2
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: arch             = llama
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: vocab type       = SPM
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_vocab          = 32016
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_merges         = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: vocab_only       = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ctx_train      = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd           = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_layer          = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_head           = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_head_kv        = 40
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_rot            = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_swa            = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_gqa            = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ff             = 13824
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_expert         = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_expert_used    = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: causal attn      = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: pooling type     = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope type        = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope scaling     = linear
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: freq_base_train  = 1000000.0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: freq_scale_train = 1
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_d_state      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model type       = 13B
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model ftype      = Q4_0
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model params     = 13.02 B
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: general.name     = codellama
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: BOS token        = 1 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: EOS token        = 2 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: UNK token        = 0 ''
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: SUF token        = 32008 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: MID token        = 32009 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: EOT token        = 32010 '▁'
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_print_meta: max token length = 48
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 15 12:58:45 launchpad ollama[1566]: ggml_cuda_init: found 1 CUDA devices:
+Feb 15 12:58:45 launchpad ollama[1566]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Feb 15 12:58:45 launchpad ollama[1566]: time=2025-02-15T12:58:45.524-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: offloading 39 repeating layers to GPU
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors: offloaded 39/41 layers to GPU
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Feb 15 12:58:45 launchpad ollama[1566]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_ctx      = 2048
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_batch    = 512
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: n_ubatch   = 512
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: flash_attn = 0
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: freq_base  = 1000000.0
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: freq_scale = 1
+Feb 15 12:58:46 launchpad ollama[1566]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: graph nodes  = 1286
+Feb 15 12:58:46 launchpad ollama[1566]: llama_new_context_with_model: graph splits = 15
+Feb 15 12:58:46 launchpad ollama[210632]: INFO [main] model loaded | tid="140125316128768" timestamp=1739653126
+Feb 15 12:58:46 launchpad ollama[1566]: time=2025-02-15T12:58:46.527-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Feb 15 12:59:40 launchpad ollama[1566]: [GIN] 2025/02/15 - 12:59:40 | 200 | 55.188965858s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 13:01:42 launchpad ollama[1566]: time=2025-02-15T13:01:42.169-08:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Feb 15 13:03:25 launchpad ollama[1566]: [GIN] 2025/02/15 - 13:03:25 | 200 |         1m43s |       127.0.0.1 | POST     "/api/chat"
+Feb 15 13:53:02 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 15 13:53:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 15 13:53:02 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 15 13:53:02 launchpad systemd[1]: ollama.service: Consumed 12min 8.868s CPU time, 8G memory peak, 7.1G read from disk, 508.1M written to disk, 1.8M incoming IP traffic, 2.5M outgoing IP traffic.
+-- Boot e896c194ceef425f8e564ca391b26ad0 --
+Feb 15 13:53:08 launchpad systemd[1]: Starting Server for local large language models...
+Feb 15 13:53:08 launchpad systemd[1]: Started Server for local large language models.
+Feb 15 13:53:08 launchpad ollama[1533]: 2025/02/15 13:53:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.945-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.950-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.953-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 15 13:53:08 launchpad ollama[1533]: time=2025-02-15T13:53:08.954-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama382441005/runners
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.943-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:11 launchpad ollama[1533]: time=2025-02-15T13:53:11.944-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 15 13:53:12 launchpad ollama[1533]: time=2025-02-15T13:53:12.141-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 15 18:09:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 15 18:09:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 15 18:09:05 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 15 18:09:05 launchpad systemd[1]: ollama.service: Consumed 3.408s CPU time, 787.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot b75c99985a0841129b7af6edd7fb4133 --
+Feb 16 04:02:17 launchpad systemd[1]: Starting Server for local large language models...
+Feb 16 04:02:17 launchpad systemd[1]: Started Server for local large language models.
+Feb 16 04:02:17 launchpad ollama[1540]: 2025/02/16 04:02:17 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.372-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.376-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.378-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 16 04:02:17 launchpad ollama[1540]: time=2025-02-16T04:02:17.380-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1089523640/runners
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.273-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.274-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 16 04:02:20 launchpad ollama[1540]: time=2025-02-16T04:02:20.509-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:27 | 200 |     804.242µs |       127.0.0.1 | HEAD     "/"
+Feb 17 13:36:27 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:27 | 200 |   16.897737ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.971-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9833349120 required="6.2 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.971-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.972-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34405"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 13:36:27 launchpad ollama[1540]: time=2025-02-17T13:36:27.974-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] build info | build=0 commit="unknown" tid="140422218096640" timestamp=1739828188
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140422218096640" timestamp=1739828188 total_threads=16
+Feb 17 13:36:28 launchpad ollama[99439]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34405" tid="140422218096640" timestamp=1739828188
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 13:36:28 launchpad ollama[1540]: time=2025-02-17T13:36:28.225-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 13:36:28 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 13:36:28 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 13:36:28 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 13:36:33 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 13:36:33 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 13:36:34 launchpad ollama[99439]: INFO [main] model loaded | tid="140422218096640" timestamp=1739828194
+Feb 17 13:36:34 launchpad ollama[1540]: time=2025-02-17T13:36:34.244-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Feb 17 13:36:34 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:36:34 | 200 |  6.447249478s |       127.0.0.1 | POST     "/api/generate"
+Feb 17 13:38:07 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:38:07 | 200 | 10.646925045s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:41:18 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:41:18 | 200 |  6.021633368s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.454-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.454-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.455-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37795"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.456-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] build info | build=0 commit="unknown" tid="140633737093120" timestamp=1739829048
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140633737093120" timestamp=1739829048 total_threads=16
+Feb 17 13:50:48 launchpad ollama[101447]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37795" tid="140633737093120" timestamp=1739829048
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 13:50:48 launchpad ollama[1540]: time=2025-02-17T13:50:48.707-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 13:50:48 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 13:50:48 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 13:50:48 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 13:50:49 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 13:50:49 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 13:50:49 launchpad ollama[101447]: INFO [main] model loaded | tid="140633737093120" timestamp=1739829049
+Feb 17 13:50:49 launchpad ollama[1540]: time=2025-02-17T13:50:49.460-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 13:50:55 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:50:55 | 200 |  7.472184967s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 13:52:48 launchpad ollama[1540]: [GIN] 2025/02/17 - 13:52:48 | 200 |  5.467955799s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.157-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34253"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.158-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] build info | build=0 commit="unknown" tid="139969446260736" timestamp=1739829670
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139969446260736" timestamp=1739829670 total_threads=16
+Feb 17 14:01:10 launchpad ollama[102929]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34253" tid="139969446260736" timestamp=1739829670
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:01:10 launchpad ollama[1540]: time=2025-02-17T14:01:10.410-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:01:10 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:01:10 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:01:10 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:01:11 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:01:11 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:01:11 launchpad ollama[102929]: INFO [main] model loaded | tid="139969446260736" timestamp=1739829671
+Feb 17 14:01:11 launchpad ollama[1540]: time=2025-02-17T14:01:11.163-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:01:17 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:01:17 | 200 |  7.928378858s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:04:37 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:04:37 | 200 | 15.181517664s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:06:53 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:06:53 | 200 |  6.931962169s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:11:04 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:11:04 | 200 |  6.738335961s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9884205056 required="6.2 GiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.636-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40167"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.637-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] build info | build=0 commit="unknown" tid="139986082029568" timestamp=1739830605
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139986082029568" timestamp=1739830605 total_threads=16
+Feb 17 14:16:45 launchpad ollama[105090]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40167" tid="139986082029568" timestamp=1739830605
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:16:45 launchpad ollama[1540]: time=2025-02-17T14:16:45.888-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:16:45 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:16:45 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:16:45 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:16:46 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:16:46 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:16:46 launchpad ollama[105090]: INFO [main] model loaded | tid="139986082029568" timestamp=1739830606
+Feb 17 14:16:46 launchpad ollama[1540]: time=2025-02-17T14:16:46.642-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:16:54 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:16:54 | 200 |  8.560344084s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:17:53 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:17:53 | 200 |  5.860552603s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.586-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9884205056 required="6.2 GiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.586-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.587-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39049"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.588-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] build info | build=0 commit="unknown" tid="140273477804032" timestamp=1739831090
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140273477804032" timestamp=1739831090 total_threads=16
+Feb 17 14:24:50 launchpad ollama[106236]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39049" tid="140273477804032" timestamp=1739831090
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:24:50 launchpad ollama[1540]: time=2025-02-17T14:24:50.839-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:24:50 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:24:50 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:24:50 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:24:51 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:24:51 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:24:51 launchpad ollama[106236]: INFO [main] model loaded | tid="140273477804032" timestamp=1739831091
+Feb 17 14:24:51 launchpad ollama[1540]: time=2025-02-17T14:24:51.592-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:25:01 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:25:01 | 200 | 11.294123511s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:26:56 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:26:56 | 200 | 10.134133371s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.671-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44533"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.672-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] build info | build=0 commit="unknown" tid="140186630414336" timestamp=1739831745
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140186630414336" timestamp=1739831745 total_threads=16
+Feb 17 14:35:45 launchpad ollama[107804]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44533" tid="140186630414336" timestamp=1739831745
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:35:45 launchpad ollama[1540]: time=2025-02-17T14:35:45.923-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:35:45 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:35:45 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:35:45 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:35:46 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:35:46 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:35:46 launchpad ollama[107804]: INFO [main] model loaded | tid="140186630414336" timestamp=1739831746
+Feb 17 14:35:46 launchpad ollama[1540]: time=2025-02-17T14:35:46.675-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:35:51 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:35:51 | 200 |   6.32260039s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:36:31 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:36:31 | 200 |    11.97683ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:40:48 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:40:48 | 200 |   4.33500691s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:41:35 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:41:35 | 200 |   12.500274ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:41:43 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:41:43 | 200 |   12.221463ms |       127.0.0.1 | POST     "/api/show"
+Feb 17 14:43:19 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:43:19 | 200 |  4.854734596s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:45:35 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:45:35 | 200 |   6.46467151s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:47:03 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:47:03 | 200 |  5.245958268s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9883090944 required="6.2 GiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.431-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.432-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37441"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.433-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] build info | build=0 commit="unknown" tid="140268142682112" timestamp=1739833096
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140268142682112" timestamp=1739833096 total_threads=16
+Feb 17 14:58:16 launchpad ollama[110943]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37441" tid="140268142682112" timestamp=1739833096
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 14:58:16 launchpad ollama[1540]: time=2025-02-17T14:58:16.684-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 14:58:16 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 14:58:16 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 14:58:16 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 14:58:17 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 14:58:17 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 14:58:17 launchpad ollama[110943]: INFO [main] model loaded | tid="140268142682112" timestamp=1739833097
+Feb 17 14:58:17 launchpad ollama[1540]: time=2025-02-17T14:58:17.436-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 14:58:24 launchpad ollama[1540]: [GIN] 2025/02/17 - 14:58:24 | 200 |   8.33625331s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:00:03 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:00:03 | 200 |  6.886185006s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:03:25 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:03:25 | 200 |  8.601641964s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879945216 required="6.2 GiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.115-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.116-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38177"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.116-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.117-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.117-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] build info | build=0 commit="unknown" tid="140038500421632" timestamp=1739833925
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140038500421632" timestamp=1739833925 total_threads=16
+Feb 17 15:12:05 launchpad ollama[112866]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38177" tid="140038500421632" timestamp=1739833925
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 15:12:05 launchpad ollama[1540]: time=2025-02-17T15:12:05.368-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 15:12:05 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 15:12:05 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 15:12:05 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 15:12:06 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 15:12:06 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 15:12:06 launchpad ollama[112866]: INFO [main] model loaded | tid="140038500421632" timestamp=1739833926
+Feb 17 15:12:06 launchpad ollama[1540]: time=2025-02-17T15:12:06.121-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 15:12:15 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:12:15 | 200 |  10.59340036s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.487-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9883090944 required="6.2 GiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.487-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.7 GiB" free_swap="68.9 GiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.488-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1089523640/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41091"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.489-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] build info | build=0 commit="unknown" tid="140332373102592" timestamp=1739834438
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140332373102592" timestamp=1739834438 total_threads=16
+Feb 17 15:20:38 launchpad ollama[114084]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41091" tid="140332373102592" timestamp=1739834438
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type  f32:   65 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type q4_0:  225 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llama_model_loader: - type q6_K:    1 tensors
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_vocab: special tokens cache size = 256
+Feb 17 15:20:38 launchpad ollama[1540]: time=2025-02-17T15:20:38.740-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: arch             = llama
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: vocab type       = BPE
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_vocab          = 128256
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_merges         = 280147
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: vocab_only       = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd           = 4096
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_layer          = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_head           = 32
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_head_kv        = 8
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_rot            = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_swa            = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_gqa            = 4
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ff             = 14336
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_expert         = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_expert_used    = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: causal attn      = 1
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: pooling type     = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope type        = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope scaling     = linear
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: freq_scale_train = 1
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_d_state      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model type       = 8B
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model ftype      = Q4_0
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model params     = 8.03 B
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_print_meta: max token length = 256
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 17 15:20:38 launchpad ollama[1540]: ggml_cuda_init: found 1 CUDA devices:
+Feb 17 15:20:38 launchpad ollama[1540]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 17 15:20:38 launchpad ollama[1540]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_ctx      = 8192
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_batch    = 512
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: n_ubatch   = 512
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: flash_attn = 0
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: freq_scale = 1
+Feb 17 15:20:39 launchpad ollama[1540]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: graph nodes  = 1030
+Feb 17 15:20:39 launchpad ollama[1540]: llama_new_context_with_model: graph splits = 2
+Feb 17 15:20:39 launchpad ollama[114084]: INFO [main] model loaded | tid="140332373102592" timestamp=1739834439
+Feb 17 15:20:39 launchpad ollama[1540]: time=2025-02-17T15:20:39.492-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 17 15:20:47 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:20:47 | 200 |  9.225234909s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:23:08 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:23:08 | 200 | 10.996744755s |       127.0.0.1 | POST     "/api/chat"
+Feb 17 15:23:40 launchpad ollama[1540]: [GIN] 2025/02/17 - 15:23:40 | 200 |  8.134205118s |       127.0.0.1 | POST     "/api/chat"
+Feb 21 16:18:22 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 21 16:18:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 21 16:18:23 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 21 16:18:23 launchpad systemd[1]: ollama.service: Consumed 3min 14.009s CPU time, 5.6G memory peak, 4.6G read from disk, 508.1M written to disk, 4.9M incoming IP traffic, 5.5M outgoing IP traffic.
+-- Boot 56a21e2208e1447e869457b88292f8e2 --
+Feb 21 16:19:16 launchpad systemd[1]: Starting Server for local large language models...
+Feb 21 16:19:16 launchpad systemd[1]: Started Server for local large language models.
+Feb 21 16:19:16 launchpad ollama[1539]: 2025/02/21 16:19:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.227-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.232-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.235-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 21 16:19:16 launchpad ollama[1539]: time=2025-02-21T16:19:16.236-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1648657965/runners
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.116-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:19 launchpad ollama[1539]: time=2025-02-21T16:19:19.117-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:19:27 launchpad ollama[1539]: time=2025-02-21T16:19:27.054-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 21 16:20:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 21 16:20:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 21 16:20:06 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 21 16:20:06 launchpad systemd[1]: ollama.service: Consumed 11.213s CPU time, 786.5M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot e12f595e26c445d49248c90c7f337992 --
+Feb 21 16:20:41 launchpad systemd[1]: Starting Server for local large language models...
+Feb 21 16:20:41 launchpad systemd[1]: Started Server for local large language models.
+Feb 21 16:20:41 launchpad ollama[1510]: 2025/02/21 16:20:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.265-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.269-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.271-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 21 16:20:41 launchpad ollama[1510]: time=2025-02-21T16:20:41.272-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama141469161/runners
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.178-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.179-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.179-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.180-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.180-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 21 16:20:44 launchpad ollama[1510]: time=2025-02-21T16:20:44.400-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:10:36 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:10:36 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:10:36 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:10:36 launchpad systemd[1]: ollama.service: Consumed 3.725s CPU time, 786.4M memory peak, 234.3M read from disk, 508.1M written to disk.
+Feb 22 15:10:42 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:10:42 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:10:42 launchpad ollama[609219]: 2025/02/22 15:10:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.546-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.546-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.547-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:10:42 launchpad ollama[609219]: time=2025-02-22T15:10:42.547-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1820237840/runners
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.649-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:10:45 launchpad ollama[609219]: time=2025-02-22T15:10:45.905-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="9.0 GiB"
+Feb 22 15:29:56 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:29:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:29:56 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:29:56 launchpad systemd[1]: ollama.service: Consumed 3.608s CPU time, 552.2M memory peak, 508.1M written to disk.
+-- Boot 2661aaee13a741189bfa7b24c1e81223 --
+Feb 22 15:30:33 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:30:33 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:30:33 launchpad ollama[1534]: 2025/02/22 15:30:33 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.266-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.271-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.272-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:30:33 launchpad ollama[1534]: time=2025-02-22T15:30:33.275-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3061524593/runners
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.259-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:30:36 launchpad ollama[1534]: time=2025-02-22T15:30:36.441-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:35:33 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:35:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:35:33 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:35:33 launchpad systemd[1]: ollama.service: Consumed 3.404s CPU time, 787M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot bc1d123a60ee4d9eb1178bf3702c8b05 --
+Feb 22 15:36:05 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:36:05 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:36:06 launchpad ollama[1534]: 2025/02/22 15:36:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.054-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.059-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.060-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:36:06 launchpad ollama[1534]: time=2025-02-22T15:36:06.061-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama504777951/runners
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.044-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.045-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:36:09 launchpad ollama[1534]: time=2025-02-22T15:36:09.224-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.1 GiB"
+Feb 22 15:37:38 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:37:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:37:38 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:37:38 launchpad systemd[1]: ollama.service: Consumed 3.398s CPU time, 788.9M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 2e14837fa2344ee885c7a1f0d3779a60 --
+Feb 22 15:38:16 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:38:16 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:38:16 launchpad ollama[1531]: 2025/02/22 15:38:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.410-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.415-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.416-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:38:16 launchpad ollama[1531]: time=2025-02-22T15:38:16.418-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1154028157/runners
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.302-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:19 launchpad ollama[1531]: time=2025-02-22T15:38:19.303-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:38:21 launchpad ollama[1531]: time=2025-02-22T15:38:21.002-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 22 15:38:39 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 15:38:39 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 15:38:39 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 15:38:39 launchpad systemd[1]: ollama.service: Consumed 5.157s CPU time, 787.5M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot 40e81e4e58c840da9f86fa8bdb940f2e --
+Feb 22 15:39:12 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 15:39:12 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 15:39:12 launchpad ollama[1529]: 2025/02/22 15:39:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.201-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.206-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.206-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 15:39:12 launchpad ollama[1529]: time=2025-02-22T15:39:12.208-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1018485095/runners
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.159-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.160-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.160-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.161-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 15:39:15 launchpad ollama[1529]: time=2025-02-22T15:39:15.368-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.0 GiB"
+Feb 22 17:16:05 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 17:16:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 17:16:05 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 17:16:05 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 787.6M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot f7192e1dfabf4cf78ceb4665c68773dc --
+Feb 22 17:16:36 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 17:16:37 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 17:16:37 launchpad ollama[1531]: 2025/02/22 17:16:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.096-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.101-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.102-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 17:16:37 launchpad ollama[1531]: time=2025-02-22T17:16:37.103-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama779339256/runners
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.071-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.071-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.072-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:16:40 launchpad ollama[1531]: time=2025-02-22T17:16:40.277-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 22 17:30:21 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 22 17:30:21 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 22 17:30:21 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 22 17:30:21 launchpad systemd[1]: ollama.service: Consumed 3.403s CPU time, 788.5M memory peak, 234.8M read from disk, 508.1M written to disk.
+-- Boot a2e049ff56334ceebf2a3a5d90b82b24 --
+Feb 22 17:30:52 launchpad systemd[1]: Starting Server for local large language models...
+Feb 22 17:30:52 launchpad systemd[1]: Started Server for local large language models.
+Feb 22 17:30:52 launchpad ollama[1537]: 2025/02/22 17:30:52 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.849-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.854-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.855-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 22 17:30:52 launchpad ollama[1537]: time=2025-02-22T17:30:52.855-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3828889336/runners
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:55 launchpad ollama[1537]: time=2025-02-22T17:30:55.846-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 22 17:30:56 launchpad ollama[1537]: time=2025-02-22T17:30:56.029-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 23 09:26:59 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 23 09:26:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 23 09:26:59 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 23 09:26:59 launchpad systemd[1]: ollama.service: Consumed 3.455s CPU time, 788.4M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 1e58dfadc1614a5e95da04bf186aedd0 --
+Feb 23 09:27:18 launchpad systemd[1]: Starting Server for local large language models...
+Feb 23 09:27:18 launchpad systemd[1]: Started Server for local large language models.
+Feb 23 09:27:18 launchpad ollama[1566]: 2025/02/23 09:27:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.760-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.767-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.768-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 23 09:27:18 launchpad ollama[1566]: time=2025-02-23T09:27:18.769-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama121305121/runners
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.610-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.611-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 09:27:21 launchpad ollama[1566]: time=2025-02-23T09:27:21.611-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:07:02 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: State 'stop-sigterm' timed out. Killing.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1566 (.ollama-wrapped) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1623 (.ollama-wrapped) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1624 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1625 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Killing process 1627 (n/a) with signal SIGKILL.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Main process exited, code=killed, status=9/KILL
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Failed with result 'timeout'.
+Feb 23 12:08:32 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 23 12:08:32 launchpad systemd[1]: ollama.service: Consumed 4.704s CPU time, 771.6M memory peak, 232.9M read from disk, 508.1M written to disk.
+-- Boot cea875ad09b44a3e90809d58d6431a00 --
+Feb 23 12:09:03 launchpad systemd[1]: Starting Server for local large language models...
+Feb 23 12:09:03 launchpad systemd[1]: Started Server for local large language models.
+Feb 23 12:09:03 launchpad ollama[1533]: 2025/02/23 12:09:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.172-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.177-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.178-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 23 12:09:03 launchpad ollama[1533]: time=2025-02-23T12:09:03.180-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1424745992/runners
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.164-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.165-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 23 12:09:06 launchpad ollama[1533]: time=2025-02-23T12:09:06.354-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:22:46 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:22:46 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:22:46 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:22:46 launchpad systemd[1]: ollama.service: Consumed 4.400s CPU time, 787.7M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot f2764ae1480f433eb1ba957d62a7667b --
+Feb 26 12:23:26 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:23:26 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:23:26 launchpad ollama[1532]: 2025/02/26 12:23:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.480-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.484-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.485-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:23:26 launchpad ollama[1532]: time=2025-02-26T12:23:26.488-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2149963496/runners
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.325-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.326-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:29 launchpad ollama[1532]: time=2025-02-26T12:23:29.326-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:23:37 launchpad ollama[1532]: time=2025-02-26T12:23:37.324-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 12:23:58 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:23:58 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:23:58 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:23:58 launchpad systemd[1]: ollama.service: Consumed 11.151s CPU time, 787.4M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 70b89230457f412db030b8af1136a7c8 --
+Feb 26 12:24:29 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:24:29 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:24:29 launchpad ollama[1522]: 2025/02/26 12:24:29 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.340-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.345-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.346-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:24:29 launchpad ollama[1522]: time=2025-02-26T12:24:29.350-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3150348286/runners
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.288-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.289-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:24:32 launchpad ollama[1522]: time=2025-02-26T12:24:32.503-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:32:47 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:32:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:32:47 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:32:47 launchpad systemd[1]: ollama.service: Consumed 3.386s CPU time, 787.8M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 5cc17d87b335478baf55194da99ed320 --
+Feb 26 12:33:18 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:33:18 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:33:18 launchpad ollama[1523]: 2025/02/26 12:33:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.901-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.908-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.909-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:33:18 launchpad ollama[1523]: time=2025-02-26T12:33:18.911-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3865012489/runners
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.831-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:21 launchpad ollama[1523]: time=2025-02-26T12:33:21.832-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:33:22 launchpad ollama[1523]: time=2025-02-26T12:33:22.056-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 12:36:51 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 12:36:52 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 12:36:52 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 12:36:52 launchpad systemd[1]: ollama.service: Consumed 3.354s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 0936d1c99120419cac9a87deaba8ae5c --
+Feb 26 12:37:23 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 12:37:23 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 12:37:23 launchpad ollama[1524]: 2025/02/26 12:37:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.373-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.377-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.381-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 12:37:23 launchpad ollama[1524]: time=2025-02-26T12:37:23.383-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1493177641/runners
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.363-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.364-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.364-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.365-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.365-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 12:37:26 launchpad ollama[1524]: time=2025-02-26T12:37:26.583-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Feb 26 15:03:01 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:03:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:03:01 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:03:01 launchpad systemd[1]: ollama.service: Consumed 3.498s CPU time, 787.5M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 97d6655fe25445a8b4cd6aba50ef3cdb --
+Feb 26 15:03:34 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:03:34 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:03:34 launchpad ollama[1578]: 2025/02/26 15:03:34 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.360-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.365-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.366-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:03:34 launchpad ollama[1578]: time=2025-02-26T15:03:34.368-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3125874339/runners
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.313-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.314-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.314-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:03:37 launchpad ollama[1578]: time=2025-02-26T15:03:37.566-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:05:26 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:05:27 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:05:27 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:05:27 launchpad systemd[1]: ollama.service: Consumed 3.373s CPU time, 787.1M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot 15ffaa8cc0d94398a6a4c294dd4bfd4a --
+Feb 26 15:06:23 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:06:23 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:06:23 launchpad ollama[1583]: 2025/02/26 15:06:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.003-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.008-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.009-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:06:24 launchpad ollama[1583]: time=2025-02-26T15:06:24.009-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3586662773/runners
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.021-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.021-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.022-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:06:27 launchpad ollama[1583]: time=2025-02-26T15:06:27.260-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:08:41 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:08:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:08:41 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:08:41 launchpad systemd[1]: ollama.service: Consumed 3.470s CPU time, 787.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot e838850f300640b481e59975fb387ccf --
+Feb 26 15:09:11 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:09:12 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:09:12 launchpad ollama[1579]: 2025/02/26 15:09:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.123-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.127-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.128-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:09:12 launchpad ollama[1579]: time=2025-02-26T15:09:12.129-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3477743635/runners
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.064-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.065-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.065-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.066-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.066-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:09:15 launchpad ollama[1579]: time=2025-02-26T15:09:15.290-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:11:28 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:11:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:11:29 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:11:29 launchpad systemd[1]: ollama.service: Consumed 3.382s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 1d2da796c9bd4e5ab274940ad14b6e58 --
+Feb 26 15:12:00 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:12:00 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:12:00 launchpad ollama[1574]: 2025/02/26 15:12:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.581-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.586-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.587-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:12:00 launchpad ollama[1574]: time=2025-02-26T15:12:00.589-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1532247760/runners
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.598-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.599-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.599-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:12:03 launchpad ollama[1574]: time=2025-02-26T15:12:03.846-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 15:18:35 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 15:18:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 15:18:35 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 15:18:35 launchpad systemd[1]: ollama.service: Consumed 3.432s CPU time, 787.3M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 773915b1c3354fba8ededbdbe8db91d4 --
+Feb 26 15:19:07 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 15:19:07 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 15:19:08 launchpad ollama[1580]: 2025/02/26 15:19:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.077-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.081-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.082-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 15:19:08 launchpad ollama[1580]: time=2025-02-26T15:19:08.084-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3665229947/runners
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.039-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.040-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.040-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.041-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.041-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 15:19:11 launchpad ollama[1580]: time=2025-02-26T15:19:11.297-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 26 17:08:19 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:08:20 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:08:20 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:08:20 launchpad systemd[1]: ollama.service: Consumed 3.453s CPU time, 787.6M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot d7af577310c84eb597d71261f9d9c39a --
+Feb 26 17:08:55 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:08:55 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:08:55 launchpad ollama[1574]: 2025/02/26 17:08:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.289-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.294-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.295-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:08:55 launchpad ollama[1574]: time=2025-02-26T17:08:55.298-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4184677761/runners
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.270-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.271-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:08:58 launchpad ollama[1574]: time=2025-02-26T17:08:58.836-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 17:27:49 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:27:49 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:27:49 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:27:49 launchpad systemd[1]: ollama.service: Consumed 3.447s CPU time, 787.3M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 219199171ac04c159dfa8abb490de196 --
+Feb 26 17:28:22 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:28:22 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:28:22 launchpad ollama[1577]: 2025/02/26 17:28:22 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.156-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.164-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.165-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:28:22 launchpad ollama[1577]: time=2025-02-26T17:28:22.166-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1559463410/runners
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.084-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.085-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:28:25 launchpad ollama[1577]: time=2025-02-26T17:28:25.328-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Feb 26 17:29:35 launchpad systemd[1]: Stopping Server for local large language models...
+Feb 26 17:29:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Feb 26 17:29:35 launchpad systemd[1]: Stopped Server for local large language models.
+Feb 26 17:29:35 launchpad systemd[1]: ollama.service: Consumed 3.328s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 288488cd00584e369ea93802998b6fa9 --
+Feb 26 17:30:07 launchpad systemd[1]: Starting Server for local large language models...
+Feb 26 17:30:07 launchpad systemd[1]: Started Server for local large language models.
+Feb 26 17:30:07 launchpad ollama[1577]: 2025/02/26 17:30:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.318-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.324-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.326-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Feb 26 17:30:07 launchpad ollama[1577]: time=2025-02-26T17:30:07.328-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama85053230/runners
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.265-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.266-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.266-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.267-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.267-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Feb 26 17:30:10 launchpad ollama[1577]: time=2025-02-26T17:30:10.553-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:12 | 200 |    1.209007ms |       127.0.0.1 | HEAD     "/"
+Feb 28 14:02:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:12 | 200 |   16.047359ms |       127.0.0.1 | POST     "/api/show"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.814-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9214951424 required="6.2 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.814-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.815-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34661"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:02:12 launchpad ollama[1577]: time=2025-02-28T14:02:12.817-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] build info | build=0 commit="unknown" tid="140506088456192" timestamp=1740780133
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140506088456192" timestamp=1740780133 total_threads=16
+Feb 28 14:02:13 launchpad ollama[114394]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34661" tid="140506088456192" timestamp=1740780133
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:02:13 launchpad ollama[1577]: time=2025-02-28T14:02:13.068-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:02:13 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:02:13 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:02:13 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:02:18 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:02:19 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:02:19 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:02:19 launchpad ollama[114394]: INFO [main] model loaded | tid="140506088456192" timestamp=1740780139
+Feb 28 14:02:19 launchpad ollama[1577]: time=2025-02-28T14:02:19.346-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.53 seconds"
+Feb 28 14:02:19 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:02:19 | 200 |   6.70611953s |       127.0.0.1 | POST     "/api/generate"
+Feb 28 14:03:52 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:03:52 | 200 |   9.54814712s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:08:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:08:12 | 200 | 12.130951516s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.728-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217769472 required="6.2 GiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.728-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.729-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44135"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.730-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] build info | build=0 commit="unknown" tid="140616155516928" timestamp=1740781760
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140616155516928" timestamp=1740781760 total_threads=16
+Feb 28 14:29:20 launchpad ollama[118548]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44135" tid="140616155516928" timestamp=1740781760
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:29:20 launchpad ollama[1577]: time=2025-02-28T14:29:20.981-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:29:20 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:29:21 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:29:21 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:29:21 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:29:21 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:29:21 launchpad ollama[118548]: INFO [main] model loaded | tid="140616155516928" timestamp=1740781761
+Feb 28 14:29:21 launchpad ollama[1577]: time=2025-02-28T14:29:21.734-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:29:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:29:32 | 200 | 11.611788012s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.366-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9223798784 required="6.2 GiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.366-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.367-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35261"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.368-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] build info | build=0 commit="unknown" tid="140354746605568" timestamp=1740782199
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140354746605568" timestamp=1740782199 total_threads=16
+Feb 28 14:36:39 launchpad ollama[119666]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35261" tid="140354746605568" timestamp=1740782199
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:36:39 launchpad ollama[1577]: time=2025-02-28T14:36:39.619-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:36:39 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:36:39 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:36:39 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:36:40 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:36:40 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:36:40 launchpad ollama[119666]: INFO [main] model loaded | tid="140354746605568" timestamp=1740782200
+Feb 28 14:36:40 launchpad ollama[1577]: time=2025-02-28T14:36:40.372-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:36:47 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:36:47 | 200 |  8.488455937s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:40:21 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:40:21 | 200 | 10.054006302s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.124-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.2 GiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.124-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.125-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34885"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] build info | build=0 commit="unknown" tid="140576655659008" timestamp=1740782840
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140576655659008" timestamp=1740782840 total_threads=16
+Feb 28 14:47:20 launchpad ollama[121414]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34885" tid="140576655659008" timestamp=1740782840
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 14:47:20 launchpad ollama[1577]: time=2025-02-28T14:47:20.377-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 14:47:20 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 14:47:20 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 14:47:20 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 14:47:21 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 14:47:21 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 14:47:21 launchpad ollama[121414]: INFO [main] model loaded | tid="140576655659008" timestamp=1740782841
+Feb 28 14:47:21 launchpad ollama[1577]: time=2025-02-28T14:47:21.130-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 14:47:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 14:47:32 | 200 | 12.711746214s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.291-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215803392 required="6.2 GiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.291-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.292-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.292-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39517"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.293-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] build info | build=0 commit="unknown" tid="140152201580544" timestamp=1740783697
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140152201580544" timestamp=1740783697 total_threads=16
+Feb 28 15:01:37 launchpad ollama[123542]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39517" tid="140152201580544" timestamp=1740783697
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:01:37 launchpad ollama[1577]: time=2025-02-28T15:01:37.544-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:01:37 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:01:37 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:01:37 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:01:38 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:01:38 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:01:38 launchpad ollama[123542]: INFO [main] model loaded | tid="140152201580544" timestamp=1740783698
+Feb 28 15:01:38 launchpad ollama[1577]: time=2025-02-28T15:01:38.298-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:01:48 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:01:48 | 200 | 11.520683682s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9193324544 required="6.2 GiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.386-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33669"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.387-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] build info | build=0 commit="unknown" tid="140538006142976" timestamp=1740784444
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140538006142976" timestamp=1740784444 total_threads=16
+Feb 28 15:14:04 launchpad ollama[125470]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33669" tid="140538006142976" timestamp=1740784444
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:14:04 launchpad ollama[1577]: time=2025-02-28T15:14:04.639-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:14:04 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:14:04 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:14:04 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:14:05 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:14:05 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:14:05 launchpad ollama[125470]: INFO [main] model loaded | tid="140538006142976" timestamp=1740784445
+Feb 28 15:14:05 launchpad ollama[1577]: time=2025-02-28T15:14:05.392-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:14:10 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:14:10 | 200 |  6.596609391s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217507328 required="6.2 GiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.324-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39179"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.325-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] build info | build=0 commit="unknown" tid="139824834080768" timestamp=1740786382
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139824834080768" timestamp=1740786382 total_threads=16
+Feb 28 15:46:22 launchpad ollama[131154]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39179" tid="139824834080768" timestamp=1740786382
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:46:22 launchpad ollama[1577]: time=2025-02-28T15:46:22.576-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:46:22 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:46:22 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:46:22 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:46:23 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:46:23 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:46:23 launchpad ollama[131154]: INFO [main] model loaded | tid="139824834080768" timestamp=1740786383
+Feb 28 15:46:23 launchpad ollama[1577]: time=2025-02-28T15:46:23.329-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:46:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:46:34 | 200 | 12.125250601s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9217900544 required="6.2 GiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.659-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44653"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.661-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] build info | build=0 commit="unknown" tid="139659496579072" timestamp=1740786701
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139659496579072" timestamp=1740786701 total_threads=16
+Feb 28 15:51:41 launchpad ollama[131955]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44653" tid="139659496579072" timestamp=1740786701
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 15:51:41 launchpad ollama[1577]: time=2025-02-28T15:51:41.912-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 15:51:41 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 15:51:41 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 15:51:41 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 15:51:42 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 15:51:42 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 15:51:42 launchpad ollama[131955]: INFO [main] model loaded | tid="139659496579072" timestamp=1740786702
+Feb 28 15:51:42 launchpad ollama[1577]: time=2025-02-28T15:51:42.665-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 15:51:51 launchpad ollama[1577]: [GIN] 2025/02/28 - 15:51:51 | 200 |  9.682349236s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.708-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218490368 required="6.2 GiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.708-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.709-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45959"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.710-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] build info | build=0 commit="unknown" tid="140079823495168" timestamp=1740787305
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140079823495168" timestamp=1740787305 total_threads=16
+Feb 28 16:01:45 launchpad ollama[133439]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45959" tid="140079823495168" timestamp=1740787305
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:01:45 launchpad ollama[1577]: time=2025-02-28T16:01:45.961-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:01:45 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:01:45 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:01:45 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:01:46 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:01:46 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:01:46 launchpad ollama[133439]: INFO [main] model loaded | tid="140079823495168" timestamp=1740787306
+Feb 28 16:01:46 launchpad ollama[1577]: time=2025-02-28T16:01:46.714-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:01:53 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:01:53 | 200 |  7.500569739s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:03:31 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:03:31 | 200 |  1.889825884s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:06:32 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:06:32 | 200 |   8.81532141s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:08:10 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:08:10 | 200 |  9.606859294s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200926720 required="6.2 GiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.622-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41925"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.623-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] build info | build=0 commit="unknown" tid="140114432872448" timestamp=1740789108
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140114432872448" timestamp=1740789108 total_threads=16
+Feb 28 16:31:48 launchpad ollama[140473]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41925" tid="140114432872448" timestamp=1740789108
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:31:48 launchpad ollama[1577]: time=2025-02-28T16:31:48.874-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:31:48 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:31:48 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:31:48 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:31:49 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:31:49 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:31:49 launchpad ollama[140473]: INFO [main] model loaded | tid="140114432872448" timestamp=1740789109
+Feb 28 16:31:49 launchpad ollama[1577]: time=2025-02-28T16:31:49.627-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:31:56 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:31:56 | 200 |  8.361332959s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:33:50 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:33:50 | 200 |  6.532573967s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:36:27 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:36:27 | 200 |  7.005495158s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:38:35 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:38:35 | 200 |  7.225236848s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:40:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:40:39 | 200 |  6.244688232s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9209184256 required="6.2 GiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.059-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46827"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.060-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] build info | build=0 commit="unknown" tid="139897028616192" timestamp=1740789989
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139897028616192" timestamp=1740789989 total_threads=16
+Feb 28 16:46:29 launchpad ollama[142637]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46827" tid="139897028616192" timestamp=1740789989
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:46:29 launchpad ollama[1577]: time=2025-02-28T16:46:29.311-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:46:29 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:46:29 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:46:29 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:46:29 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:46:30 launchpad ollama[142637]: INFO [main] model loaded | tid="139897028616192" timestamp=1740789990
+Feb 28 16:46:30 launchpad ollama[1577]: time=2025-02-28T16:46:30.065-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:46:38 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:46:38 | 200 |  9.983171798s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.277-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9208856576 required="6.2 GiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.277-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.278-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37471"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.279-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] build info | build=0 commit="unknown" tid="140661142446080" timestamp=1740790704
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140661142446080" timestamp=1740790704 total_threads=16
+Feb 28 16:58:24 launchpad ollama[144397]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37471" tid="140661142446080" timestamp=1740790704
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 16:58:24 launchpad ollama[1577]: time=2025-02-28T16:58:24.530-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 16:58:24 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 16:58:24 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 16:58:24 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 16:58:25 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 16:58:25 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 16:58:25 launchpad ollama[144397]: INFO [main] model loaded | tid="140661142446080" timestamp=1740790705
+Feb 28 16:58:25 launchpad ollama[1577]: time=2025-02-28T16:58:25.283-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 16:58:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 16:58:34 | 200 | 10.082412373s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9368961024 required="6.2 GiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.072-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43047"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.073-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] build info | build=0 commit="unknown" tid="140694722121728" timestamp=1740791162
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140694722121728" timestamp=1740791162 total_threads=16
+Feb 28 17:06:02 launchpad ollama[145542]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43047" tid="140694722121728" timestamp=1740791162
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:06:02 launchpad ollama[1577]: time=2025-02-28T17:06:02.324-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:06:02 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:06:02 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:06:02 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:06:02 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:06:03 launchpad ollama[145542]: INFO [main] model loaded | tid="140694722121728" timestamp=1740791163
+Feb 28 17:06:03 launchpad ollama[1577]: time=2025-02-28T17:06:03.077-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:06:13 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:06:13 | 200 | 11.290778531s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9368764416 required="6.2 GiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.362-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34297"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.363-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.364-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] build info | build=0 commit="unknown" tid="139994690822144" timestamp=1740792122
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139994690822144" timestamp=1740792122 total_threads=16
+Feb 28 17:22:02 launchpad ollama[147869]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34297" tid="139994690822144" timestamp=1740792122
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:22:02 launchpad ollama[1577]: time=2025-02-28T17:22:02.615-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:22:02 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:22:02 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:22:02 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:22:03 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:22:03 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:22:03 launchpad ollama[147869]: INFO [main] model loaded | tid="139994690822144" timestamp=1740792123
+Feb 28 17:22:03 launchpad ollama[1577]: time=2025-02-28T17:22:03.368-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:22:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:22:12 | 200 | 10.311215036s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9369092096 required="6.2 GiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.224-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43575"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.225-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] build info | build=0 commit="unknown" tid="139748682330112" timestamp=1740792940
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748682330112" timestamp=1740792940 total_threads=16
+Feb 28 17:35:40 launchpad ollama[149852]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43575" tid="139748682330112" timestamp=1740792940
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:35:40 launchpad ollama[1577]: time=2025-02-28T17:35:40.476-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:35:40 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:35:40 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:35:40 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:35:41 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:35:41 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:35:41 launchpad ollama[149852]: INFO [main] model loaded | tid="139748682330112" timestamp=1740792941
+Feb 28 17:35:41 launchpad ollama[1577]: time=2025-02-28T17:35:41.229-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 17:35:51 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:35:51 | 200 | 11.237254254s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158852608 required="6.2 GiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.028-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41533"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.029-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] build info | build=0 commit="unknown" tid="139945581817856" timestamp=1740793575
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139945581817856" timestamp=1740793575 total_threads=16
+Feb 28 17:46:15 launchpad ollama[151489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41533" tid="139945581817856" timestamp=1740793575
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:46:15 launchpad ollama[1577]: time=2025-02-28T17:46:15.281-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:46:15 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:46:15 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:46:15 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:46:15 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:46:16 launchpad ollama[151489]: INFO [main] model loaded | tid="139945581817856" timestamp=1740793576
+Feb 28 17:46:16 launchpad ollama[1577]: time=2025-02-28T17:46:16.286-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Feb 28 17:46:25 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:46:25 | 200 | 10.663781668s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.765-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9159049216 required="6.2 GiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.766-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.766-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37203"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 17:55:59 launchpad ollama[1577]: time=2025-02-28T17:55:59.767-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] build info | build=0 commit="unknown" tid="139748496625664" timestamp=1740794159
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139748496625664" timestamp=1740794159 total_threads=16
+Feb 28 17:55:59 launchpad ollama[152915]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37203" tid="139748496625664" timestamp=1740794159
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 17:55:59 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 17:56:00 launchpad ollama[1577]: time=2025-02-28T17:56:00.018-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 17:56:00 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 17:56:00 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 17:56:00 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 17:56:00 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 17:56:00 launchpad ollama[152915]: INFO [main] model loaded | tid="139748496625664" timestamp=1740794160
+Feb 28 17:56:00 launchpad ollama[1577]: time=2025-02-28T17:56:00.772-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Feb 28 17:56:09 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:56:09 | 200 |  9.701286012s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 17:58:09 launchpad ollama[1577]: [GIN] 2025/02/28 - 17:58:09 | 200 |  6.213241985s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.920-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158983680 required="6.2 GiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.920-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.921-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38017"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:03:11 launchpad ollama[1577]: time=2025-02-28T18:03:11.922-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] build info | build=0 commit="unknown" tid="140487825596416" timestamp=1740794591
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140487825596416" timestamp=1740794591 total_threads=16
+Feb 28 18:03:11 launchpad ollama[154015]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38017" tid="140487825596416" timestamp=1740794591
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:03:11 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:03:12 launchpad ollama[1577]: time=2025-02-28T18:03:12.173-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:03:12 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:03:12 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:03:12 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:03:12 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:03:12 launchpad ollama[154015]: INFO [main] model loaded | tid="140487825596416" timestamp=1740794592
+Feb 28 18:03:12 launchpad ollama[1577]: time=2025-02-28T18:03:12.926-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:03:23 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:03:23 | 200 | 11.507349113s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9159311360 required="6.2 GiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.204-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42861"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.205-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.206-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] build info | build=0 commit="unknown" tid="139802792906752" timestamp=1740794956
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139802792906752" timestamp=1740794956 total_threads=16
+Feb 28 18:09:16 launchpad ollama[154906]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42861" tid="139802792906752" timestamp=1740794956
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:09:16 launchpad ollama[1577]: time=2025-02-28T18:09:16.456-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:09:16 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:09:16 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:09:16 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:09:17 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:09:17 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:09:17 launchpad ollama[154906]: INFO [main] model loaded | tid="139802792906752" timestamp=1740794957
+Feb 28 18:09:17 launchpad ollama[1577]: time=2025-02-28T18:09:17.461-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Feb 28 18:09:25 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:09:25 | 200 |  9.757300442s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.434-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9158787072 required="6.2 GiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.435-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.0 GiB" free_swap="68.9 GiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.435-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38313"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.436-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] build info | build=0 commit="unknown" tid="139803478745088" timestamp=1740795567
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139803478745088" timestamp=1740795567 total_threads=16
+Feb 28 18:19:27 launchpad ollama[156404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38313" tid="139803478745088" timestamp=1740795567
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:19:27 launchpad ollama[1577]: time=2025-02-28T18:19:27.687-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:19:27 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:19:27 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:19:27 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:19:28 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:19:28 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:19:28 launchpad ollama[156404]: INFO [main] model loaded | tid="139803478745088" timestamp=1740795568
+Feb 28 18:19:28 launchpad ollama[1577]: time=2025-02-28T18:19:28.440-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:19:34 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:19:34 | 200 |  7.046852586s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:21:45 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:21:45 | 200 |  4.693537262s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:24:54 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:24:54 | 200 |  8.155017137s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:28:27 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:28:27 | 200 |  7.947639339s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:33:12 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:33:12 | 200 |  7.435491807s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:34:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:34:39 | 200 |  7.483601103s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:37:23 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:37:23 | 200 | 13.887015441s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:39:39 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:39:39 | 200 | 12.413931467s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.915-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9192407040 required="6.2 GiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.916-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.9 GiB" free_swap="68.9 GiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.916-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36579"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Feb 28 18:45:40 launchpad ollama[1577]: time=2025-02-28T18:45:40.917-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] build info | build=0 commit="unknown" tid="140624644788224" timestamp=1740797140
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624644788224" timestamp=1740797140 total_threads=16
+Feb 28 18:45:40 launchpad ollama[160387]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36579" tid="140624644788224" timestamp=1740797140
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Feb 28 18:45:40 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Feb 28 18:45:41 launchpad ollama[1577]: time=2025-02-28T18:45:41.168-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Feb 28 18:45:41 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Feb 28 18:45:41 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Feb 28 18:45:41 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Feb 28 18:45:41 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Feb 28 18:45:41 launchpad ollama[160387]: INFO [main] model loaded | tid="140624644788224" timestamp=1740797141
+Feb 28 18:45:41 launchpad ollama[1577]: time=2025-02-28T18:45:41.920-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Feb 28 18:45:53 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:45:53 | 200 | 13.180541336s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:48:16 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:48:16 | 200 |  5.436342371s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:49:33 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:49:33 | 200 | 13.168018795s |       127.0.0.1 | POST     "/api/chat"
+Feb 28 18:52:19 launchpad ollama[1577]: [GIN] 2025/02/28 - 18:52:19 | 200 |  4.436562112s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.424-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200467968 required="6.2 GiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.424-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.425-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36111"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.426-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] build info | build=0 commit="unknown" tid="140502316916736" timestamp=1740856342
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140502316916736" timestamp=1740856342 total_threads=16
+Mar 01 11:12:22 launchpad ollama[175662]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36111" tid="140502316916736" timestamp=1740856342
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:12:22 launchpad ollama[1577]: time=2025-03-01T11:12:22.677-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:12:22 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:12:22 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:12:22 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:12:23 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:12:23 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:12:23 launchpad ollama[175662]: INFO [main] model loaded | tid="140502316916736" timestamp=1740856343
+Mar 01 11:12:23 launchpad ollama[1577]: time=2025-03-01T11:12:23.430-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:12:29 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:12:29 | 200 |  6.927777363s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9220128768 required="6.2 GiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.853-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.854-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39789"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:18:29 launchpad ollama[1577]: time=2025-03-01T11:18:29.855-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] build info | build=0 commit="unknown" tid="140010778365952" timestamp=1740856709
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140010778365952" timestamp=1740856709 total_threads=16
+Mar 01 11:18:29 launchpad ollama[176582]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39789" tid="140010778365952" timestamp=1740856709
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:18:29 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:18:30 launchpad ollama[1577]: time=2025-03-01T11:18:30.106-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:18:30 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:18:30 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:18:30 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:18:30 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:18:30 launchpad ollama[176582]: INFO [main] model loaded | tid="140010778365952" timestamp=1740856710
+Mar 01 11:18:30 launchpad ollama[1577]: time=2025-03-01T11:18:30.858-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:18:36 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:18:36 | 200 |  7.292746198s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9204793344 required="6.2 GiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.615-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34011"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.616-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] build info | build=0 commit="unknown" tid="140036986118144" timestamp=1740857773
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140036986118144" timestamp=1740857773 total_threads=16
+Mar 01 11:36:13 launchpad ollama[179702]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34011" tid="140036986118144" timestamp=1740857773
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:36:13 launchpad ollama[1577]: time=2025-03-01T11:36:13.867-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:36:13 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:36:13 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:36:13 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:36:14 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:36:14 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:36:14 launchpad ollama[179702]: INFO [main] model loaded | tid="140036986118144" timestamp=1740857774
+Mar 01 11:36:14 launchpad ollama[1577]: time=2025-03-01T11:36:14.621-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:36:19 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:36:19 | 200 |   6.23055622s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:37:46 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:37:46 | 200 |  4.635591282s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:42:43 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:42:43 | 200 |  5.334304431s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:43:36 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:43:36 | 200 |  3.389103063s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9364897792 required="6.2 GiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.793-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.794-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35295"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:50:35 launchpad ollama[1577]: time=2025-03-01T11:50:35.795-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] build info | build=0 commit="unknown" tid="139639387136000" timestamp=1740858635
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139639387136000" timestamp=1740858635 total_threads=16
+Mar 01 11:50:35 launchpad ollama[182689]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35295" tid="139639387136000" timestamp=1740858635
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:50:35 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:50:36 launchpad ollama[1577]: time=2025-03-01T11:50:36.045-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:50:36 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:50:36 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:50:36 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:50:36 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:50:36 launchpad ollama[182689]: INFO [main] model loaded | tid="139639387136000" timestamp=1740858636
+Mar 01 11:50:36 launchpad ollama[1577]: time=2025-03-01T11:50:36.798-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 11:50:43 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:50:43 | 200 |  8.024380557s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9364570112 required="6.2 GiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.565-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38443"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.566-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] build info | build=0 commit="unknown" tid="139662885605376" timestamp=1740858984
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139662885605376" timestamp=1740858984 total_threads=16
+Mar 01 11:56:24 launchpad ollama[183553]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38443" tid="139662885605376" timestamp=1740858984
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 11:56:24 launchpad ollama[1577]: time=2025-03-01T11:56:24.817-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 11:56:24 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 11:56:24 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 11:56:24 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 11:56:25 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 11:56:25 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 11:56:25 launchpad ollama[183553]: INFO [main] model loaded | tid="139662885605376" timestamp=1740858985
+Mar 01 11:56:25 launchpad ollama[1577]: time=2025-03-01T11:56:25.571-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 01 11:56:33 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:56:33 | 200 |  8.853137861s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 11:56:57 launchpad ollama[1577]: [GIN] 2025/03/01 - 11:56:57 | 200 |  4.653598121s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9365159936 required="6.2 GiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.717-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36717"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.718-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] build info | build=0 commit="unknown" tid="140640671145984" timestamp=1740859495
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140640671145984" timestamp=1740859495 total_threads=16
+Mar 01 12:04:55 launchpad ollama[184840]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36717" tid="140640671145984" timestamp=1740859495
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:04:55 launchpad ollama[1577]: time=2025-03-01T12:04:55.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:04:55 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:04:56 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:04:56 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:04:56 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:04:56 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:04:56 launchpad ollama[184840]: INFO [main] model loaded | tid="140640671145984" timestamp=1740859496
+Mar 01 12:04:56 launchpad ollama[1577]: time=2025-03-01T12:04:56.724-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 01 12:05:05 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:05:05 | 200 | 10.042959498s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.518-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9160818688 required="6.2 GiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.518-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.519-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.519-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama85053230/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43851"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.520-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] build info | build=0 commit="unknown" tid="140659319099392" timestamp=1740860290
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140659319099392" timestamp=1740860290 total_threads=16
+Mar 01 12:18:10 launchpad ollama[187318]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43851" tid="140659319099392" timestamp=1740860290
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:18:10 launchpad ollama[1577]: time=2025-03-01T12:18:10.770-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: arch             = llama
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_head           = 32
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope type        = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model type       = 8B
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_print_meta: max token length = 256
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:18:10 launchpad ollama[1577]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:18:10 launchpad ollama[1577]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:18:10 launchpad ollama[1577]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:18:11 launchpad ollama[1577]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:18:11 launchpad ollama[1577]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:18:11 launchpad ollama[187318]: INFO [main] model loaded | tid="140659319099392" timestamp=1740860291
+Mar 01 12:18:11 launchpad ollama[1577]: time=2025-03-01T12:18:11.523-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:18:20 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:18:20 | 200 | 10.317186819s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:22:20 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:22:20 | 200 |  2.432697878s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:22:46 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:22:46 | 200 |  1.239694497s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:23:16 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:23:16 | 200 |  2.620858532s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:24:32 launchpad ollama[1577]: [GIN] 2025/03/01 - 12:24:32 | 200 |  2.535180181s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:29:02 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 01 12:29:04 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 01 12:29:04 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 01 12:29:04 launchpad systemd[1]: ollama.service: Consumed 8min 8.911s CPU time, 5.7G memory peak, 4.6G read from disk, 508.1M written to disk, 15.1M incoming IP traffic, 13.8M outgoing IP traffic.
+-- Boot 326ba5c09665430bb47760ff439ddc9d --
+Mar 01 12:31:20 launchpad systemd[1]: Starting Server for local large language models...
+Mar 01 12:31:20 launchpad systemd[1]: Started Server for local large language models.
+Mar 01 12:31:20 launchpad ollama[1578]: 2025/03/01 12:31:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.491-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.497-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.498-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 01 12:31:20 launchpad ollama[1578]: time=2025-03-01T12:31:20.498-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4107338876/runners
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.507-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.507-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.508-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 01 12:31:23 launchpad ollama[1578]: time=2025-03-01T12:31:23.760-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:09 | 200 |    1.123129ms |       127.0.0.1 | HEAD     "/"
+Mar 01 12:39:09 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:09 | 200 |   16.386631ms |       127.0.0.1 | POST     "/api/show"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.716-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10398269440 required="6.2 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.717-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.717-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33739"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.718-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.719-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] build info | build=0 commit="unknown" tid="140044305031168" timestamp=1740861549
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140044305031168" timestamp=1740861549 total_threads=16
+Mar 01 12:39:09 launchpad ollama[6231]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33739" tid="140044305031168" timestamp=1740861549
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:39:09 launchpad ollama[1578]: time=2025-03-01T12:39:09.970-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:39:09 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:39:10 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:39:10 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:39:10 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:39:15 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:39:15 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:39:15 launchpad ollama[6231]: INFO [main] model loaded | tid="140044305031168" timestamp=1740861555
+Mar 01 12:39:15 launchpad ollama[1578]: time=2025-03-01T12:39:15.989-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 01 12:39:15 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:39:15 | 200 |   6.44667237s |       127.0.0.1 | POST     "/api/generate"
+Mar 01 12:41:46 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:41:46 | 200 |  8.404180719s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.817-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10412621824 required="6.2 GiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.817-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.818-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.818-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38815"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:47:15 launchpad ollama[1578]: time=2025-03-01T12:47:15.819-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] build info | build=0 commit="unknown" tid="140573945192448" timestamp=1740862035
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140573945192448" timestamp=1740862035 total_threads=16
+Mar 01 12:47:15 launchpad ollama[7489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38815" tid="140573945192448" timestamp=1740862035
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:47:15 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:47:16 launchpad ollama[1578]: time=2025-03-01T12:47:16.070-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:47:16 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:47:16 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:47:16 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:47:16 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:47:16 launchpad ollama[7489]: INFO [main] model loaded | tid="140573945192448" timestamp=1740862036
+Mar 01 12:47:16 launchpad ollama[1578]: time=2025-03-01T12:47:16.822-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:47:22 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:47:22 | 200 |   7.34386985s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.625-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10420944896 required="6.2 GiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.625-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.626-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.626-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32795"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.627-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] build info | build=0 commit="unknown" tid="140264708308992" timestamp=1740862511
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140264708308992" timestamp=1740862511 total_threads=16
+Mar 01 12:55:11 launchpad ollama[8645]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32795" tid="140264708308992" timestamp=1740862511
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 01 12:55:11 launchpad ollama[1578]: time=2025-03-01T12:55:11.878-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 01 12:55:11 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 01 12:55:11 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 01 12:55:11 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 01 12:55:12 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 01 12:55:12 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 01 12:55:12 launchpad ollama[8645]: INFO [main] model loaded | tid="140264708308992" timestamp=1740862512
+Mar 01 12:55:12 launchpad ollama[1578]: time=2025-03-01T12:55:12.631-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 01 12:55:21 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:55:21 | 200 | 10.523565209s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 12:58:22 launchpad ollama[1578]: [GIN] 2025/03/01 - 12:58:22 | 200 |  7.212702764s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:02:07 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:02:07 | 200 |  6.981875324s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:05:28 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:05:28 | 200 |  9.350942946s |       127.0.0.1 | POST     "/api/chat"
+Mar 01 13:09:20 launchpad ollama[1578]: [GIN] 2025/03/01 - 13:09:20 | 200 |  7.443790312s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:47:32 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:32 | 200 |      16.092µs |       127.0.0.1 | HEAD     "/"
+Mar 03 17:47:32 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:32 | 200 |   12.822257ms |       127.0.0.1 | POST     "/api/show"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.738-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10137108480 required="6.2 GiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.738-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.739-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33649"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.740-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] build info | build=0 commit="unknown" tid="140182838722560" timestamp=1741052852
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140182838722560" timestamp=1741052852 total_threads=16
+Mar 03 17:47:32 launchpad ollama[138722]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33649" tid="140182838722560" timestamp=1741052852
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 17:47:32 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 17:47:32 launchpad ollama[1578]: time=2025-03-03T17:47:32.991-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 17:47:33 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 17:47:33 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 17:47:33 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 17:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 17:47:33 launchpad ollama[138722]: INFO [main] model loaded | tid="140182838722560" timestamp=1741052853
+Mar 03 17:47:33 launchpad ollama[1578]: time=2025-03-03T17:47:33.744-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 17:47:33 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:47:33 | 200 |    1.1873722s |       127.0.0.1 | POST     "/api/generate"
+Mar 03 17:48:57 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:48:57 | 200 |  9.301976434s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:50:07 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:50:07 | 200 |  2.790235696s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:51:23 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:51:23 | 200 |  2.949668506s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:53:43 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:53:43 | 200 |  7.528375037s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 17:58:46 launchpad ollama[1578]: [GIN] 2025/03/03 - 17:58:46 | 200 |  5.971180011s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:01:10 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:01:10 | 200 |  6.007280076s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:03:23 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:03:23 | 200 |  7.641810212s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:06:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:06:14 | 200 | 11.400963968s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:09:00 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:09:00 | 200 | 10.117915402s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:11:41 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:11:41 | 200 | 11.023541934s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:14:40 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:14:40 | 200 |  6.319622758s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:16:36 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:16:36 | 200 | 12.224033757s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:20:15 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:20:15 | 200 | 12.105570051s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:22:13 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:22:13 | 200 | 12.579028629s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:25:39 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:25:39 | 200 |   8.22646555s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.461-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10137763840 required="6.2 GiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.461-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.462-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41337"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.463-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] build info | build=0 commit="unknown" tid="140101783797760" timestamp=1741056452
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140101783797760" timestamp=1741056452 total_threads=16
+Mar 03 18:47:32 launchpad ollama[150110]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41337" tid="140101783797760" timestamp=1741056452
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 18:47:32 launchpad ollama[1578]: time=2025-03-03T18:47:32.714-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 18:47:32 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 18:47:32 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 18:47:32 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 18:47:33 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 18:47:33 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 18:47:33 launchpad ollama[150110]: INFO [main] model loaded | tid="140101783797760" timestamp=1741056453
+Mar 03 18:47:33 launchpad ollama[1578]: time=2025-03-03T18:47:33.466-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 18:47:42 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:47:42 | 200 | 10.500599287s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:52:22 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:52:22 | 200 |  7.587219807s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:54:27 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:54:27 | 200 |  8.470405711s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:57:52 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:57:52 | 200 |  4.447967803s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 18:59:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 18:59:14 | 200 |  2.824189707s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:03:59 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:03:59 | 200 | 10.396395011s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:07:13 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:07:13 | 200 |  6.766175494s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:11:18 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:11:18 | 200 |  7.686619617s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:13:03 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:13:03 | 200 |  7.175445461s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:14:48 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:14:48 | 200 |  5.937771734s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:19:20 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:19:20 | 200 |  5.136648442s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.371-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10108403712 required="6.2 GiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.371-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.372-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39841"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.373-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] build info | build=0 commit="unknown" tid="140256606535680" timestamp=1741058761
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140256606535680" timestamp=1741058761 total_threads=16
+Mar 03 19:26:01 launchpad ollama[156193]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39841" tid="140256606535680" timestamp=1741058761
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 19:26:01 launchpad ollama[1578]: time=2025-03-03T19:26:01.624-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 19:26:01 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 19:26:01 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 19:26:01 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 19:26:02 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 19:26:02 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 19:26:02 launchpad ollama[156193]: INFO [main] model loaded | tid="140256606535680" timestamp=1741058762
+Mar 03 19:26:02 launchpad ollama[1578]: time=2025-03-03T19:26:02.376-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 19:26:09 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:26:09 | 200 |  8.429151547s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10108403712 required="6.2 GiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.128-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41563"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.129-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] build info | build=0 commit="unknown" tid="139874576224256" timestamp=1741059132
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139874576224256" timestamp=1741059132 total_threads=16
+Mar 03 19:32:12 launchpad ollama[157111]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41563" tid="139874576224256" timestamp=1741059132
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 19:32:12 launchpad ollama[1578]: time=2025-03-03T19:32:12.381-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 19:32:12 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 19:32:12 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 19:32:12 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 19:32:13 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 19:32:13 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 19:32:13 launchpad ollama[157111]: INFO [main] model loaded | tid="139874576224256" timestamp=1741059133
+Mar 03 19:32:13 launchpad ollama[1578]: time=2025-03-03T19:32:13.134-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 19:32:19 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:32:19 | 200 |  7.977784337s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:35:47 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:35:47 | 200 |  6.735567135s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:38:39 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:38:39 | 200 |  6.162431839s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:40:49 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:40:49 | 200 |  5.969356111s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:44:19 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:44:19 | 200 |  5.933615782s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:48:50 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:48:50 | 200 |  5.946570658s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:51:25 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:51:25 | 200 |  5.526996601s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:52:54 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:52:54 | 200 |  5.919910557s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:54:24 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:54:24 | 200 |  6.540529442s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 19:58:45 launchpad ollama[1578]: [GIN] 2025/03/03 - 19:58:45 | 200 |  7.570463036s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.246-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10131800064 required="6.2 GiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.247-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.7 GiB" free_swap="68.9 GiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.247-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33455"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.248-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] build info | build=0 commit="unknown" tid="140155033649152" timestamp=1741061037
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140155033649152" timestamp=1741061037 total_threads=16
+Mar 03 20:03:57 launchpad ollama[161748]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33455" tid="140155033649152" timestamp=1741061037
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 03 20:03:57 launchpad ollama[1578]: time=2025-03-03T20:03:57.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 03 20:03:57 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 03 20:03:57 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 03 20:03:57 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 03 20:03:58 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 03 20:03:58 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 03 20:03:58 launchpad ollama[161748]: INFO [main] model loaded | tid="140155033649152" timestamp=1741061038
+Mar 03 20:03:58 launchpad ollama[1578]: time=2025-03-03T20:03:58.252-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 03 20:04:04 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:04:04 | 200 |   7.82306194s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:05:58 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:05:58 | 200 |  5.359156714s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:10:15 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:10:15 | 200 |  7.520659398s |       127.0.0.1 | POST     "/api/chat"
+Mar 03 20:13:14 launchpad ollama[1578]: [GIN] 2025/03/03 - 20:13:14 | 200 |  5.873403815s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9873063936 required="6.2 GiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.444-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41855"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.445-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] build info | build=0 commit="unknown" tid="139834739843072" timestamp=1741120312
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139834739843072" timestamp=1741120312 total_threads=16
+Mar 04 12:31:52 launchpad ollama[194341]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41855" tid="139834739843072" timestamp=1741120312
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:31:52 launchpad ollama[1578]: time=2025-03-04T12:31:52.697-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:31:52 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:31:52 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:31:52 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:31:53 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:31:53 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:31:53 launchpad ollama[194341]: INFO [main] model loaded | tid="139834739843072" timestamp=1741120313
+Mar 04 12:31:53 launchpad ollama[1578]: time=2025-03-04T12:31:53.450-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 12:32:02 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:32:02 | 200 |  9.805607255s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:33:35 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:33:35 | 200 | 12.595152534s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9874112512 required="6.2 GiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.595-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37311"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.596-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] build info | build=0 commit="unknown" tid="140578525372416" timestamp=1741120728
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140578525372416" timestamp=1741120728 total_threads=16
+Mar 04 12:38:48 launchpad ollama[195404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37311" tid="140578525372416" timestamp=1741120728
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:38:48 launchpad ollama[1578]: time=2025-03-04T12:38:48.848-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:38:48 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:38:48 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:38:48 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:38:49 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:38:49 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:38:49 launchpad ollama[195404]: INFO [main] model loaded | tid="140578525372416" timestamp=1741120729
+Mar 04 12:38:49 launchpad ollama[1578]: time=2025-03-04T12:38:49.601-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 12:38:59 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:38:59 | 200 | 11.390635217s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.718-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9873457152 required="6.2 GiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.719-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.719-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34873"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.720-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] build info | build=0 commit="unknown" tid="140391407271936" timestamp=1741121106
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140391407271936" timestamp=1741121106 total_threads=16
+Mar 04 12:45:06 launchpad ollama[196314]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34873" tid="140391407271936" timestamp=1741121106
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 12:45:06 launchpad ollama[1578]: time=2025-03-04T12:45:06.972-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 12:45:06 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 12:45:06 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 12:45:06 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 12:45:07 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 12:45:07 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 12:45:07 launchpad ollama[196314]: INFO [main] model loaded | tid="140391407271936" timestamp=1741121107
+Mar 04 12:45:07 launchpad ollama[1578]: time=2025-03-04T12:45:07.726-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 04 12:45:10 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:45:10 | 200 |   3.63866554s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:45:42 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:45:42 | 200 |  6.699999806s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:47:56 launchpad ollama[196314]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1012 n_keep=24 n_left=2024 n_shift=1012 tid="140391407271936" timestamp=1741121276
+Mar 04 12:48:07 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:48:07 | 200 | 11.361400348s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 12:50:31 launchpad ollama[1578]: [GIN] 2025/03/04 - 12:50:31 | 200 |  7.939433655s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.087-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9885777920 required="6.2 GiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.087-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.088-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39039"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.089-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] build info | build=0 commit="unknown" tid="140166450544640" timestamp=1741122443
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140166450544640" timestamp=1741122443 total_threads=16
+Mar 04 13:07:23 launchpad ollama[200031]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39039" tid="140166450544640" timestamp=1741122443
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:07:23 launchpad ollama[1578]: time=2025-03-04T13:07:23.339-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:07:23 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:07:23 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:07:23 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:07:23 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:07:24 launchpad ollama[200031]: INFO [main] model loaded | tid="140166450544640" timestamp=1741122444
+Mar 04 13:07:24 launchpad ollama[1578]: time=2025-03-04T13:07:24.092-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:07:31 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:07:31 | 200 |  8.591458449s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9885908992 required="6.2 GiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.052-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36843"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.053-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] build info | build=0 commit="unknown" tid="139948605792256" timestamp=1741123573
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139948605792256" timestamp=1741123573 total_threads=16
+Mar 04 13:26:13 launchpad ollama[202775]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36843" tid="139948605792256" timestamp=1741123573
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:26:13 launchpad ollama[1578]: time=2025-03-04T13:26:13.305-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:26:13 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:26:13 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:26:13 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:26:13 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:26:14 launchpad ollama[202775]: INFO [main] model loaded | tid="139948605792256" timestamp=1741123574
+Mar 04 13:26:14 launchpad ollama[1578]: time=2025-03-04T13:26:14.057-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:26:22 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:26:22 | 200 |  9.800858269s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9942859776 required="6.2 GiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.013-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36987"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.014-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] build info | build=0 commit="unknown" tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139659855163392" timestamp=1741123885 total_threads=16
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36987" tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:31:25 launchpad ollama[1578]: time=2025-03-04T13:31:25.266-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:31:25 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:31:25 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:31:25 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:31:25 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:31:25 launchpad ollama[203752]: INFO [main] model loaded | tid="139659855163392" timestamp=1741123885
+Mar 04 13:31:26 launchpad ollama[1578]: time=2025-03-04T13:31:26.019-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:31:33 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:31:33 | 200 |  8.730906641s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:36:27 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:36:27 | 200 |  7.911130843s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:38:27 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:38:27 | 200 |  4.974464312s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9953083392 required="6.2 GiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.772-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.773-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33513"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 13:44:52 launchpad ollama[1578]: time=2025-03-04T13:44:52.774-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] build info | build=0 commit="unknown" tid="140206603853824" timestamp=1741124692
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140206603853824" timestamp=1741124692 total_threads=16
+Mar 04 13:44:52 launchpad ollama[205875]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33513" tid="140206603853824" timestamp=1741124692
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 13:44:52 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 13:44:53 launchpad ollama[1578]: time=2025-03-04T13:44:53.025-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 13:44:53 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 13:44:53 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 13:44:53 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 13:44:53 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 13:44:53 launchpad ollama[205875]: INFO [main] model loaded | tid="140206603853824" timestamp=1741124693
+Mar 04 13:44:53 launchpad ollama[1578]: time=2025-03-04T13:44:53.778-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 13:45:01 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:45:01 | 200 |  8.964763487s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:48:43 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:48:43 | 200 |   6.12199874s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:51:48 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:51:48 | 200 |  5.077748233s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 13:55:25 launchpad ollama[1578]: [GIN] 2025/03/04 - 13:55:25 | 200 |  4.528189605s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9737601024 required="6.2 GiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.855-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36371"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 15:15:47 launchpad ollama[1578]: time=2025-03-04T15:15:47.856-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] build info | build=0 commit="unknown" tid="140468030492672" timestamp=1741130147
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140468030492672" timestamp=1741130147 total_threads=16
+Mar 04 15:15:47 launchpad ollama[220785]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36371" tid="140468030492672" timestamp=1741130147
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 15:15:47 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 15:15:48 launchpad ollama[1578]: time=2025-03-04T15:15:48.107-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 15:15:48 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 15:15:48 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 15:15:48 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 15:15:48 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 15:15:48 launchpad ollama[220785]: INFO [main] model loaded | tid="140468030492672" timestamp=1741130148
+Mar 04 15:15:48 launchpad ollama[1578]: time=2025-03-04T15:15:48.860-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 15:15:54 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:15:54 | 200 |  6.965924507s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:17:34 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:17:34 | 200 |   5.09332902s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:20:53 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:20:53 | 200 |  5.454866611s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:23:09 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:23:09 | 200 |  7.166144255s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:25:13 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:25:13 | 200 |  4.137473438s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:25:59 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:25:59 | 200 |  4.413648885s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:30:09 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:30:09 | 200 |  5.752504454s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:32:21 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:32:21 | 200 |  4.525342001s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:33:08 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:33:08 | 200 |  3.838820963s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:35:01 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:35:01 | 200 |   8.24367465s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:39:26 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:39:26 | 200 |  8.160357469s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:42:08 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:42:08 | 200 | 12.372227225s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 15:45:40 launchpad ollama[1578]: [GIN] 2025/03/04 - 15:45:40 | 200 |  11.56729297s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9806938112 required="6.2 GiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.656-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33095"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.657-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.658-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] build info | build=0 commit="unknown" tid="140104411656192" timestamp=1741133035
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140104411656192" timestamp=1741133035 total_threads=16
+Mar 04 16:03:55 launchpad ollama[227948]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33095" tid="140104411656192" timestamp=1741133035
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 16:03:55 launchpad ollama[1578]: time=2025-03-04T16:03:55.909-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 16:03:55 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 16:03:55 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 16:03:55 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 16:03:56 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 16:03:56 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 16:03:56 launchpad ollama[227948]: INFO [main] model loaded | tid="140104411656192" timestamp=1741133036
+Mar 04 16:03:56 launchpad ollama[1578]: time=2025-03-04T16:03:56.662-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 16:04:04 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:04:04 | 200 |   9.37725366s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9806741504 required="6.2 GiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.207-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama4107338876/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42015"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.208-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] build info | build=0 commit="unknown" tid="140637111410688" timestamp=1741133373
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140637111410688" timestamp=1741133373 total_threads=16
+Mar 04 16:09:33 launchpad ollama[228777]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42015" tid="140637111410688" timestamp=1741133373
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type  f32:   65 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type q4_0:  225 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llama_model_loader: - type q6_K:    1 tensors
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_vocab: special tokens cache size = 256
+Mar 04 16:09:33 launchpad ollama[1578]: time=2025-03-04T16:09:33.459-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: arch             = llama
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: vocab type       = BPE
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_vocab          = 128256
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_merges         = 280147
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: vocab_only       = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd           = 4096
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_layer          = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_head           = 32
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_head_kv        = 8
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_rot            = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_swa            = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_gqa            = 4
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ff             = 14336
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_expert         = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_expert_used    = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: causal attn      = 1
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: pooling type     = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope type        = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope scaling     = linear
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: freq_scale_train = 1
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_d_state      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model type       = 8B
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model ftype      = Q4_0
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model params     = 8.03 B
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_print_meta: max token length = 256
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 04 16:09:33 launchpad ollama[1578]: ggml_cuda_init: found 1 CUDA devices:
+Mar 04 16:09:33 launchpad ollama[1578]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 04 16:09:33 launchpad ollama[1578]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_ctx      = 8192
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_batch    = 512
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: n_ubatch   = 512
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: flash_attn = 0
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: freq_scale = 1
+Mar 04 16:09:34 launchpad ollama[1578]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: graph nodes  = 1030
+Mar 04 16:09:34 launchpad ollama[1578]: llama_new_context_with_model: graph splits = 2
+Mar 04 16:09:34 launchpad ollama[228777]: INFO [main] model loaded | tid="140637111410688" timestamp=1741133374
+Mar 04 16:09:34 launchpad ollama[1578]: time=2025-03-04T16:09:34.212-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 04 16:09:41 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:09:41 | 200 |  8.484282432s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:12:05 launchpad ollama[1578]: [GIN] 2025/03/04 - 16:12:05 | 200 |  6.186185362s |       127.0.0.1 | POST     "/api/chat"
+Mar 04 16:15:49 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 04 16:15:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 04 16:15:51 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 04 16:15:51 launchpad systemd[1]: ollama.service: Consumed 10min 14.208s CPU time, 5.6G memory peak, 4.6G read from disk, 508.1M written to disk, 21.8M incoming IP traffic, 19.4M outgoing IP traffic.
+-- Boot 7ef76d8f1ea24523977a1c63a31424bf --
+Mar 04 16:16:37 launchpad systemd[1]: Starting Server for local large language models...
+Mar 04 16:16:37 launchpad systemd[1]: Started Server for local large language models.
+Mar 04 16:16:37 launchpad ollama[1574]: 2025/03/04 16:16:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.511-08:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.516-08:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.516-08:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 04 16:16:37 launchpad ollama[1574]: time=2025-03-04T16:16:37.518-08:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1010747878/runners
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.519-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.520-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.520-08:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 04 16:16:40 launchpad ollama[1574]: time=2025-03-04T16:16:40.763-08:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:19 | 200 |    1.265493ms |       127.0.0.1 | HEAD     "/"
+Mar 05 14:05:19 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:19 | 200 |    16.01174ms |       127.0.0.1 | POST     "/api/show"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9879683072 required="6.2 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.344-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.345-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38609"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.346-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] build info | build=0 commit="unknown" tid="139986878709760" timestamp=1741212319
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139986878709760" timestamp=1741212319 total_threads=16
+Mar 05 14:05:19 launchpad ollama[83604]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38609" tid="139986878709760" timestamp=1741212319
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:05:19 launchpad ollama[1574]: time=2025-03-05T14:05:19.597-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:05:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:05:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:05:19 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:05:25 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:05:25 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:05:25 launchpad ollama[83604]: INFO [main] model loaded | tid="139986878709760" timestamp=1741212325
+Mar 05 14:05:25 launchpad ollama[1574]: time=2025-03-05T14:05:25.623-08:00 level=INFO source=server.go:626 msg="llama runner started in 6.28 seconds"
+Mar 05 14:05:25 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:05:25 | 200 |  6.455429877s |       127.0.0.1 | POST     "/api/generate"
+Mar 05 14:06:09 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:06:09 | 200 |  8.624030612s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:08:13 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:08:13 | 200 |  6.101035016s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:10:03 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:10:03 | 200 |  7.784464299s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9946464256 required="6.2 GiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.651-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39435"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.652-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] build info | build=0 commit="unknown" tid="139944768126976" timestamp=1741213039
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944768126976" timestamp=1741213039 total_threads=16
+Mar 05 14:17:19 launchpad ollama[85535]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39435" tid="139944768126976" timestamp=1741213039
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:17:19 launchpad ollama[1574]: time=2025-03-05T14:17:19.903-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:17:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:17:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:17:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:17:20 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:17:20 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:17:20 launchpad ollama[85535]: INFO [main] model loaded | tid="139944768126976" timestamp=1741213040
+Mar 05 14:17:20 launchpad ollama[1574]: time=2025-03-05T14:17:20.656-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:17:30 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:17:30 | 200 | 10.994111478s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:21:33 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:21:33 | 200 | 12.040119362s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:24:51 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:24:51 | 200 |  9.340320565s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:27:26 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:27:26 | 200 |  9.939923048s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9950003200 required="6.2 GiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.357-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34681"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.358-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] build info | build=0 commit="unknown" tid="139831697833984" timestamp=1741214082
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139831697833984" timestamp=1741214082 total_threads=16
+Mar 05 14:34:42 launchpad ollama[88156]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34681" tid="139831697833984" timestamp=1741214082
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:34:42 launchpad ollama[1574]: time=2025-03-05T14:34:42.609-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:34:42 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:34:42 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:34:42 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:34:43 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:34:43 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:34:43 launchpad ollama[88156]: INFO [main] model loaded | tid="139831697833984" timestamp=1741214083
+Mar 05 14:34:43 launchpad ollama[1574]: time=2025-03-05T14:34:43.363-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:34:52 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:34:52 | 200 |  9.988378986s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9952231424 required="6.2 GiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.690-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44085"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.691-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] build info | build=0 commit="unknown" tid="140117391159296" timestamp=1741214804
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140117391159296" timestamp=1741214804 total_threads=16
+Mar 05 14:46:44 launchpad ollama[90026]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44085" tid="140117391159296" timestamp=1741214804
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 14:46:44 launchpad ollama[1574]: time=2025-03-05T14:46:44.942-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 14:46:44 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 14:46:44 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 14:46:44 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 14:46:45 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 14:46:45 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 14:46:45 launchpad ollama[90026]: INFO [main] model loaded | tid="140117391159296" timestamp=1741214805
+Mar 05 14:46:45 launchpad ollama[1574]: time=2025-03-05T14:46:45.696-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 14:46:52 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:46:52 | 200 |  8.298066043s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 14:50:48 launchpad ollama[1574]: [GIN] 2025/03/05 - 14:50:48 | 200 |  8.191092759s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9947906048 required="6.2 GiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.983-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37563"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:00:06 launchpad ollama[1574]: time=2025-03-05T15:00:06.984-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] build info | build=0 commit="unknown" tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140347882876928" timestamp=1741215607 total_threads=16
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37563" tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:00:07 launchpad ollama[1574]: time=2025-03-05T15:00:07.235-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:00:07 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:00:07 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:00:07 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:00:07 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:00:07 launchpad ollama[92071]: INFO [main] model loaded | tid="140347882876928" timestamp=1741215607
+Mar 05 15:00:07 launchpad ollama[1574]: time=2025-03-05T15:00:07.989-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:00:16 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:00:16 | 200 |  9.488140419s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9967173632 required="6.2 GiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.286-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.287-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40239"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.288-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] build info | build=0 commit="unknown" tid="139942513655808" timestamp=1741215961
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139942513655808" timestamp=1741215961 total_threads=16
+Mar 05 15:06:01 launchpad ollama[93480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40239" tid="139942513655808" timestamp=1741215961
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:06:01 launchpad ollama[1574]: time=2025-03-05T15:06:01.539-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:06:01 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:06:01 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:06:01 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:06:02 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:06:02 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:06:02 launchpad ollama[93480]: INFO [main] model loaded | tid="139942513655808" timestamp=1741215962
+Mar 05 15:06:02 launchpad ollama[1574]: time=2025-03-05T15:06:02.292-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:06:10 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:06:10 | 200 |  9.075859834s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.322-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9986179072 required="6.2 GiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.322-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.323-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.323-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34679"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.324-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] build info | build=0 commit="unknown" tid="139884592377856" timestamp=1741216690
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139884592377856" timestamp=1741216690 total_threads=16
+Mar 05 15:18:10 launchpad ollama[95509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34679" tid="139884592377856" timestamp=1741216690
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:18:10 launchpad ollama[1574]: time=2025-03-05T15:18:10.574-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:18:10 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:18:10 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:18:10 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:18:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:18:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:18:11 launchpad ollama[95509]: INFO [main] model loaded | tid="139884592377856" timestamp=1741216691
+Mar 05 15:18:11 launchpad ollama[1574]: time=2025-03-05T15:18:11.327-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:18:16 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:18:16 | 200 |  6.278557801s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:20:01 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:20:01 | 200 |  3.639772193s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:22:36 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:22:36 | 200 |  2.641605505s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9964879872 required="6.2 GiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.1 GiB" free_swap="68.9 GiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.209-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34753"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.210-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.211-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] build info | build=0 commit="unknown" tid="140433811144704" timestamp=1741217383
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140433811144704" timestamp=1741217383 total_threads=16
+Mar 05 15:29:43 launchpad ollama[97994]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34753" tid="140433811144704" timestamp=1741217383
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:29:43 launchpad ollama[1574]: time=2025-03-05T15:29:43.462-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:29:43 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:29:43 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:29:43 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:29:44 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:29:44 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:29:44 launchpad ollama[97994]: INFO [main] model loaded | tid="140433811144704" timestamp=1741217384
+Mar 05 15:29:44 launchpad ollama[1574]: time=2025-03-05T15:29:44.214-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:29:49 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:29:49 | 200 |   6.96253847s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:31:12 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:31:12 | 200 |  5.408200654s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.279-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9963241472 required="6.2 GiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.279-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.280-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37607"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.281-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] build info | build=0 commit="unknown" tid="139922420887552" timestamp=1741218310
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139922420887552" timestamp=1741218310 total_threads=16
+Mar 05 15:45:10 launchpad ollama[100524]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37607" tid="139922420887552" timestamp=1741218310
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 15:45:10 launchpad ollama[1574]: time=2025-03-05T15:45:10.532-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 15:45:10 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 15:45:10 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 15:45:10 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 15:45:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 15:45:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 15:45:11 launchpad ollama[100524]: INFO [main] model loaded | tid="139922420887552" timestamp=1741218311
+Mar 05 15:45:11 launchpad ollama[1574]: time=2025-03-05T15:45:11.285-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 15:45:17 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:45:17 | 200 |  7.786290161s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:50:04 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:50:04 | 200 |  6.974356031s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:51:54 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:51:54 | 200 |  5.587612998s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:53:05 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:53:05 | 200 |  7.752311164s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:55:45 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:55:45 | 200 |   5.73340252s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 15:59:21 launchpad ollama[1574]: [GIN] 2025/03/05 - 15:59:21 | 200 |  3.732399405s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9962913792 required="6.2 GiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.727-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39665"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.728-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] build info | build=0 commit="unknown" tid="140386877947904" timestamp=1741219525
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140386877947904" timestamp=1741219525 total_threads=16
+Mar 05 16:05:25 launchpad ollama[103480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39665" tid="140386877947904" timestamp=1741219525
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 16:05:25 launchpad ollama[1574]: time=2025-03-05T16:05:25.979-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 16:05:25 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 16:05:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 16:05:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 16:05:26 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 16:05:26 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 16:05:26 launchpad ollama[103480]: INFO [main] model loaded | tid="140386877947904" timestamp=1741219526
+Mar 05 16:05:26 launchpad ollama[1574]: time=2025-03-05T16:05:26.732-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 05 16:05:32 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:05:32 | 200 |  6.557560381s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9941549056 required="6.2 GiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.914-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39547"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 05 16:24:38 launchpad ollama[1574]: time=2025-03-05T16:24:38.915-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] build info | build=0 commit="unknown" tid="140420897046528" timestamp=1741220678
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140420897046528" timestamp=1741220678 total_threads=16
+Mar 05 16:24:38 launchpad ollama[106740]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39547" tid="140420897046528" timestamp=1741220678
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 05 16:24:38 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 05 16:24:39 launchpad ollama[1574]: time=2025-03-05T16:24:39.167-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 05 16:24:39 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 05 16:24:39 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 05 16:24:39 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 05 16:24:39 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 05 16:24:39 launchpad ollama[106740]: INFO [main] model loaded | tid="140420897046528" timestamp=1741220679
+Mar 05 16:24:39 launchpad ollama[1574]: time=2025-03-05T16:24:39.920-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 05 16:24:47 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:24:47 | 200 |  8.812184503s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:25:41 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:25:41 | 200 |   6.36861991s |       127.0.0.1 | POST     "/api/chat"
+Mar 05 16:30:10 launchpad ollama[1574]: [GIN] 2025/03/05 - 16:30:10 | 200 |  4.117289805s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:11:26 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:26 | 200 |      16.604µs |       127.0.0.1 | HEAD     "/"
+Mar 07 09:11:26 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:26 | 200 |   13.376506ms |       127.0.0.1 | POST     "/api/show"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.632-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9693429760 required="6.2 GiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.633-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.633-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42183"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.634-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] build info | build=0 commit="unknown" tid="140457191243776" timestamp=1741367486
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140457191243776" timestamp=1741367486 total_threads=16
+Mar 07 09:11:26 launchpad ollama[215099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42183" tid="140457191243776" timestamp=1741367486
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:11:26 launchpad ollama[1574]: time=2025-03-07T09:11:26.885-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:11:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:11:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:11:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:11:27 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:11:27 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:11:27 launchpad ollama[215099]: INFO [main] model loaded | tid="140457191243776" timestamp=1741367487
+Mar 07 09:11:27 launchpad ollama[1574]: time=2025-03-07T09:11:27.638-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 07 09:11:27 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:11:27 | 200 |  1.178096763s |       127.0.0.1 | POST     "/api/generate"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9706602496 required="6.2 GiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.987-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.988-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40441"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:16:44 launchpad ollama[1574]: time=2025-03-07T09:16:44.989-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] build info | build=0 commit="unknown" tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140543773380608" timestamp=1741367805 total_threads=16
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40441" tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:16:45 launchpad ollama[1574]: time=2025-03-07T09:16:45.239-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:16:45 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:16:45 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:16:45 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:16:45 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:16:45 launchpad ollama[215936]: INFO [main] model loaded | tid="140543773380608" timestamp=1741367805
+Mar 07 09:16:45 launchpad ollama[1574]: time=2025-03-07T09:16:45.994-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 07 09:16:55 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:16:55 | 200 | 10.219142304s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.641-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9727442944 required="6.2 GiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.642-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.642-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40913"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.643-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] build info | build=0 commit="unknown" tid="140230817394688" timestamp=1741368747
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140230817394688" timestamp=1741368747 total_threads=16
+Mar 07 09:32:27 launchpad ollama[218242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40913" tid="140230817394688" timestamp=1741368747
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:32:27 launchpad ollama[1574]: time=2025-03-07T09:32:27.895-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:32:27 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:32:27 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:32:27 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:32:28 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:32:28 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:32:28 launchpad ollama[218242]: INFO [main] model loaded | tid="140230817394688" timestamp=1741368748
+Mar 07 09:32:28 launchpad ollama[1574]: time=2025-03-07T09:32:28.649-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 07 09:32:37 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:32:37 | 200 |  9.888731563s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:35:12 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:35:12 | 200 |  9.346114096s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9768140800 required="6.2 GiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.962-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.963-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34641"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 07 09:45:15 launchpad ollama[1574]: time=2025-03-07T09:45:15.964-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] build info | build=0 commit="unknown" tid="140378217746432" timestamp=1741369515
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140378217746432" timestamp=1741369515 total_threads=16
+Mar 07 09:45:15 launchpad ollama[220114]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34641" tid="140378217746432" timestamp=1741369515
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 07 09:45:16 launchpad ollama[1574]: time=2025-03-07T09:45:16.215-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 07 09:45:16 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 07 09:45:16 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 07 09:45:16 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 07 09:45:16 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 07 09:45:16 launchpad ollama[220114]: INFO [main] model loaded | tid="140378217746432" timestamp=1741369516
+Mar 07 09:45:16 launchpad ollama[1574]: time=2025-03-07T09:45:16.968-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 07 09:45:25 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:45:25 | 200 |  9.994966063s |       127.0.0.1 | POST     "/api/chat"
+Mar 07 09:48:48 launchpad ollama[1574]: [GIN] 2025/03/07 - 09:48:48 | 200 |  4.196691165s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:24:07 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:07 | 200 |       14.39µs |       127.0.0.1 | HEAD     "/"
+Mar 08 13:24:07 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:07 | 200 |   12.475666ms |       127.0.0.1 | POST     "/api/show"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9611706368 required="6.2 GiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.942-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40479"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:24:07 launchpad ollama[1574]: time=2025-03-08T13:24:07.944-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] build info | build=0 commit="unknown" tid="139759328415744" timestamp=1741469047
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139759328415744" timestamp=1741469047 total_threads=16
+Mar 08 13:24:07 launchpad ollama[326716]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40479" tid="139759328415744" timestamp=1741469047
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:24:07 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:24:08 launchpad ollama[1574]: time=2025-03-08T13:24:08.195-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:24:08 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:24:08 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:24:08 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:24:08 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:24:08 launchpad ollama[326716]: INFO [main] model loaded | tid="139759328415744" timestamp=1741469048
+Mar 08 13:24:08 launchpad ollama[1574]: time=2025-03-08T13:24:08.949-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 13:24:08 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:24:08 | 200 |  1.183460126s |       127.0.0.1 | POST     "/api/generate"
+Mar 08 13:27:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:27:47 | 200 |  1.094116677s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:28:25 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:28:25 | 200 |  4.521793374s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:29:51 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:29:51 | 200 |  7.506289437s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:32:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:32:15 | 200 |  6.663518074s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9673506816 required="6.2 GiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.636-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33731"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.637-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] build info | build=0 commit="unknown" tid="140209512603648" timestamp=1741469836
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140209512603648" timestamp=1741469836 total_threads=16
+Mar 08 13:37:16 launchpad ollama[328680]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33731" tid="140209512603648" timestamp=1741469836
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:37:16 launchpad ollama[1574]: time=2025-03-08T13:37:16.889-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:37:16 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:37:16 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:37:16 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:37:17 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:37:17 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:37:17 launchpad ollama[328680]: INFO [main] model loaded | tid="140209512603648" timestamp=1741469837
+Mar 08 13:37:17 launchpad ollama[1574]: time=2025-03-08T13:37:17.643-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 13:37:28 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:37:28 | 200 | 12.164662172s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:42:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:42:00 | 200 |  2.284449518s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:43:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:43:15 | 200 |   1.23992856s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:44:55 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:44:55 | 200 |  4.428215426s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9673834496 required="6.2 GiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.810-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46137"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 13:51:47 launchpad ollama[1574]: time=2025-03-08T13:51:47.811-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] build info | build=0 commit="unknown" tid="139948252450816" timestamp=1741470707
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139948252450816" timestamp=1741470707 total_threads=16
+Mar 08 13:51:47 launchpad ollama[330837]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46137" tid="139948252450816" timestamp=1741470707
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 13:51:47 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 13:51:48 launchpad ollama[1574]: time=2025-03-08T13:51:48.063-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 13:51:48 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 13:51:48 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 13:51:48 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 13:51:48 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 13:51:48 launchpad ollama[330837]: INFO [main] model loaded | tid="139948252450816" timestamp=1741470708
+Mar 08 13:51:48 launchpad ollama[1574]: time=2025-03-08T13:51:48.816-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 13:51:54 launchpad ollama[1574]: [GIN] 2025/03/08 - 13:51:54 | 200 |  6.439826955s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9608953856 required="6.2 GiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.498-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44841"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.499-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] build info | build=0 commit="unknown" tid="140476464001024" timestamp=1741471339
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476464001024" timestamp=1741471339 total_threads=16
+Mar 08 14:02:19 launchpad ollama[332538]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44841" tid="140476464001024" timestamp=1741471339
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:02:19 launchpad ollama[1574]: time=2025-03-08T14:02:19.750-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:02:19 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:02:19 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:02:19 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:02:20 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:02:20 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:02:20 launchpad ollama[332538]: INFO [main] model loaded | tid="140476464001024" timestamp=1741471340
+Mar 08 14:02:20 launchpad ollama[1574]: time=2025-03-08T14:02:20.504-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:02:26 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:02:26 | 200 |  7.576100041s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:06:41 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:06:41 | 200 |  5.728637896s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:09:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:09:37 | 200 |  7.661746806s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:11:33 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:11:33 | 200 |  4.219460673s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:13:03 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:13:03 | 200 |  6.952883113s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.763-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9704505344 required="6.2 GiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.764-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.7 GiB" free_swap="68.9 GiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.764-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33217"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:31:10 launchpad ollama[1574]: time=2025-03-08T14:31:10.765-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] build info | build=0 commit="unknown" tid="140236733587456" timestamp=1741473070
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140236733587456" timestamp=1741473070 total_threads=16
+Mar 08 14:31:10 launchpad ollama[336876]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33217" tid="140236733587456" timestamp=1741473070
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:31:10 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:31:11 launchpad ollama[1574]: time=2025-03-08T14:31:11.016-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:31:11 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:31:11 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:31:11 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:31:11 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:31:11 launchpad ollama[336876]: INFO [main] model loaded | tid="140236733587456" timestamp=1741473071
+Mar 08 14:31:11 launchpad ollama[1574]: time=2025-03-08T14:31:11.769-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:31:21 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:31:21 | 200 |  11.16257887s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9704898560 required="6.2 GiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.305-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36233"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.306-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.307-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] build info | build=0 commit="unknown" tid="140642937167872" timestamp=1741473399
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140642937167872" timestamp=1741473399 total_threads=16
+Mar 08 14:36:39 launchpad ollama[337733]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36233" tid="140642937167872" timestamp=1741473399
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:36:39 launchpad ollama[1574]: time=2025-03-08T14:36:39.558-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:36:39 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:36:39 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:36:39 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:36:40 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:36:40 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:36:40 launchpad ollama[337733]: INFO [main] model loaded | tid="140642937167872" timestamp=1741473400
+Mar 08 14:36:40 launchpad ollama[1574]: time=2025-03-08T14:36:40.311-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:36:46 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:36:46 | 200 |  7.386375645s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.578-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9705095168 required="6.2 GiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.578-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.579-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.579-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36931"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.580-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] build info | build=0 commit="unknown" tid="140399294124032" timestamp=1741474053
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140399294124032" timestamp=1741474053 total_threads=16
+Mar 08 14:47:33 launchpad ollama[339335]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36931" tid="140399294124032" timestamp=1741474053
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 14:47:33 launchpad ollama[1574]: time=2025-03-08T14:47:33.831-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 14:47:33 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 14:47:33 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 14:47:33 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 14:47:34 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 14:47:34 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 14:47:34 launchpad ollama[339335]: INFO [main] model loaded | tid="140399294124032" timestamp=1741474054
+Mar 08 14:47:34 launchpad ollama[1574]: time=2025-03-08T14:47:34.584-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 14:47:43 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:47:43 | 200 |  10.29508951s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 14:50:56 launchpad ollama[1574]: [GIN] 2025/03/08 - 14:50:56 | 200 |  8.405256104s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9687072768 required="6.2 GiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.312-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37243"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.313-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] build info | build=0 commit="unknown" tid="140708884348928" timestamp=1741474888
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140708884348928" timestamp=1741474888 total_threads=16
+Mar 08 15:01:28 launchpad ollama[341747]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37243" tid="140708884348928" timestamp=1741474888
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:01:28 launchpad ollama[1574]: time=2025-03-08T15:01:28.565-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:01:28 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:01:28 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:01:28 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:01:29 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:01:29 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:01:29 launchpad ollama[341747]: INFO [main] model loaded | tid="140708884348928" timestamp=1741474889
+Mar 08 15:01:29 launchpad ollama[1574]: time=2025-03-08T15:01:29.317-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:01:35 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:01:35 | 200 |  6.908397259s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:05:13 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:05:13 | 200 |  2.445630051s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:05:40 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:05:40 | 200 |  5.419790376s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:09:28 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:09:28 | 200 |   6.92806316s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.053-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9687400448 required="6.2 GiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.053-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.054-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46617"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.055-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] build info | build=0 commit="unknown" tid="140497237594112" timestamp=1741475738
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140497237594112" timestamp=1741475738 total_threads=16
+Mar 08 15:15:38 launchpad ollama[343797]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46617" tid="140497237594112" timestamp=1741475738
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:15:38 launchpad ollama[1574]: time=2025-03-08T15:15:38.306-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:15:38 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:15:38 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:15:38 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:15:38 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:15:39 launchpad ollama[343797]: INFO [main] model loaded | tid="140497237594112" timestamp=1741475739
+Mar 08 15:15:39 launchpad ollama[1574]: time=2025-03-08T15:15:39.060-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:15:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:15:47 | 200 |  9.394842213s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9692184576 required="6.2 GiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.791-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42417"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:24:59 launchpad ollama[1574]: time=2025-03-08T15:24:59.793-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] build info | build=0 commit="unknown" tid="140415295508480" timestamp=1741476299
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140415295508480" timestamp=1741476299 total_threads=16
+Mar 08 15:24:59 launchpad ollama[345242]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42417" tid="140415295508480" timestamp=1741476299
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:24:59 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:25:00 launchpad ollama[1574]: time=2025-03-08T15:25:00.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:25:00 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:25:00 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:25:00 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:25:00 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:25:00 launchpad ollama[345242]: INFO [main] model loaded | tid="140415295508480" timestamp=1741476300
+Mar 08 15:25:00 launchpad ollama[1574]: time=2025-03-08T15:25:00.798-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 15:25:10 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:25:10 | 200 | 10.482071839s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9692119040 required="6.2 GiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.452-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39793"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.453-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] build info | build=0 commit="unknown" tid="139987294150656" timestamp=1741476708
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139987294150656" timestamp=1741476708 total_threads=16
+Mar 08 15:31:48 launchpad ollama[346269]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39793" tid="139987294150656" timestamp=1741476708
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:31:48 launchpad ollama[1574]: time=2025-03-08T15:31:48.704-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:31:48 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:31:48 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:31:48 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:31:49 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:31:49 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:31:49 launchpad ollama[346269]: INFO [main] model loaded | tid="139987294150656" timestamp=1741476709
+Mar 08 15:31:49 launchpad ollama[1574]: time=2025-03-08T15:31:49.456-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:31:56 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:31:56 | 200 |  8.241596791s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:34:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:34:00 | 200 |  4.559451795s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.873-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9697034240 required="6.2 GiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.873-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.874-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45995"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:44:34 launchpad ollama[1574]: time=2025-03-08T15:44:34.875-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] build info | build=0 commit="unknown" tid="139937916739584" timestamp=1741477474
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139937916739584" timestamp=1741477474 total_threads=16
+Mar 08 15:44:34 launchpad ollama[348127]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45995" tid="139937916739584" timestamp=1741477474
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:44:34 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:44:35 launchpad ollama[1574]: time=2025-03-08T15:44:35.126-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:44:35 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:44:35 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:44:35 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:44:35 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:44:35 launchpad ollama[348127]: INFO [main] model loaded | tid="139937916739584" timestamp=1741477475
+Mar 08 15:44:35 launchpad ollama[1574]: time=2025-03-08T15:44:35.879-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:44:43 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:44:43 | 200 |  8.892522358s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696509952 required="6.2 GiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.638-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42471"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.639-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] build info | build=0 commit="unknown" tid="140647372591104" timestamp=1741478016
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140647372591104" timestamp=1741478016 total_threads=16
+Mar 08 15:53:36 launchpad ollama[349479]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42471" tid="140647372591104" timestamp=1741478016
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 15:53:36 launchpad ollama[1574]: time=2025-03-08T15:53:36.890-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 15:53:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 15:53:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 15:53:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 15:53:37 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 15:53:37 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 15:53:37 launchpad ollama[349479]: INFO [main] model loaded | tid="140647372591104" timestamp=1741478017
+Mar 08 15:53:37 launchpad ollama[1574]: time=2025-03-08T15:53:37.642-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 15:53:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:53:47 | 200 | 10.553367802s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 15:58:00 launchpad ollama[1574]: [GIN] 2025/03/08 - 15:58:00 | 200 | 11.997268489s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:00:39 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:00:39 | 200 |  7.990518084s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:02:15 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:02:15 | 200 | 10.239060648s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9697034240 required="6.2 GiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.043-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36165"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.044-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] build info | build=0 commit="unknown" tid="140092657197056" timestamp=1741478970
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140092657197056" timestamp=1741478970 total_threads=16
+Mar 08 16:09:30 launchpad ollama[351795]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36165" tid="140092657197056" timestamp=1741478970
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:09:30 launchpad ollama[1574]: time=2025-03-08T16:09:30.295-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:09:30 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:09:30 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:09:30 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:09:30 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:09:31 launchpad ollama[351795]: INFO [main] model loaded | tid="140092657197056" timestamp=1741478971
+Mar 08 16:09:31 launchpad ollama[1574]: time=2025-03-08T16:09:31.048-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:09:41 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:09:41 | 200 | 11.183100395s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.945-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696509952 required="6.2 GiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.945-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.946-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.946-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39539"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:19:07 launchpad ollama[1574]: time=2025-03-08T16:19:07.947-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] build info | build=0 commit="unknown" tid="140416039989248" timestamp=1741479547
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140416039989248" timestamp=1741479547 total_threads=16
+Mar 08 16:19:07 launchpad ollama[353210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39539" tid="140416039989248" timestamp=1741479547
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:19:07 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:19:08 launchpad ollama[1574]: time=2025-03-08T16:19:08.197-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:19:08 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:19:08 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:19:08 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:19:08 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:19:08 launchpad ollama[353210]: INFO [main] model loaded | tid="140416039989248" timestamp=1741479548
+Mar 08 16:19:08 launchpad ollama[1574]: time=2025-03-08T16:19:08.951-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:19:18 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:19:18 | 200 | 10.486038057s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9696837632 required="6.2 GiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.577-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41655"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.578-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] build info | build=0 commit="unknown" tid="140339924946944" timestamp=1741480356
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140339924946944" timestamp=1741480356 total_threads=16
+Mar 08 16:32:36 launchpad ollama[355180]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41655" tid="140339924946944" timestamp=1741480356
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:32:36 launchpad ollama[1574]: time=2025-03-08T16:32:36.829-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:32:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:32:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:32:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:32:37 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:32:37 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:32:37 launchpad ollama[355180]: INFO [main] model loaded | tid="140339924946944" timestamp=1741480357
+Mar 08 16:32:37 launchpad ollama[1574]: time=2025-03-08T16:32:37.583-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:32:47 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:32:47 | 200 | 11.114139548s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:37:25 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:37:25 | 200 |  9.219364707s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9702342656 required="6.2 GiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.048-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37857"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.049-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.050-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] build info | build=0 commit="unknown" tid="140671976652800" timestamp=1741481142
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140671976652800" timestamp=1741481142 total_threads=16
+Mar 08 16:45:42 launchpad ollama[357133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37857" tid="140671976652800" timestamp=1741481142
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:45:42 launchpad ollama[1574]: time=2025-03-08T16:45:42.300-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:45:42 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:45:42 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:45:42 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:45:42 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:45:43 launchpad ollama[357133]: INFO [main] model loaded | tid="140671976652800" timestamp=1741481143
+Mar 08 16:45:43 launchpad ollama[1574]: time=2025-03-08T16:45:43.054-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 16:45:52 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:45:52 | 200 | 10.725293793s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9702277120 required="6.2 GiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.031-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37797"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.032-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] build info | build=0 commit="unknown" tid="140605006110720" timestamp=1741481666
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140605006110720" timestamp=1741481666 total_threads=16
+Mar 08 16:54:26 launchpad ollama[358418]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37797" tid="140605006110720" timestamp=1741481666
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 16:54:26 launchpad ollama[1574]: time=2025-03-08T16:54:26.284-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 16:54:26 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 16:54:26 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 16:54:26 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 16:54:26 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 16:54:27 launchpad ollama[358418]: INFO [main] model loaded | tid="140605006110720" timestamp=1741481667
+Mar 08 16:54:27 launchpad ollama[1574]: time=2025-03-08T16:54:27.038-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 08 16:54:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 16:54:37 | 200 | 11.798630458s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9657909248 required="6.2 GiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.8 GiB" free_swap="68.9 GiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.070-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37719"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.071-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] build info | build=0 commit="unknown" tid="140412091461632" timestamp=1741482300
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140412091461632" timestamp=1741482300 total_threads=16
+Mar 08 17:05:00 launchpad ollama[360343]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37719" tid="140412091461632" timestamp=1741482300
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 17:05:00 launchpad ollama[1574]: time=2025-03-08T17:05:00.322-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 17:05:00 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 17:05:00 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 17:05:00 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 17:05:00 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 17:05:01 launchpad ollama[360343]: INFO [main] model loaded | tid="140412091461632" timestamp=1741482301
+Mar 08 17:05:01 launchpad ollama[1574]: time=2025-03-08T17:05:01.075-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 17:05:10 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:05:10 | 200 | 11.046435631s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:08:22 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:08:22 | 200 |  7.973867747s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:12:24 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:12:24 | 200 |  8.143167313s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:13:42 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:13:42 | 200 | 10.419543506s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:17:01 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:17:01 | 200 |  9.017646314s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:18:36 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:18:36 | 200 |  7.896930732s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:19:37 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:19:37 | 200 |  9.521671874s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:22:54 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:22:54 | 200 |  7.680583303s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:27:08 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:27:08 | 200 |  8.928069267s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:28:26 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:28:26 | 200 |   7.56655869s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:31:04 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:31:04 | 200 | 10.575788818s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9706471424 required="6.2 GiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.226-08:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.227-08:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36051"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.227-08:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.228-08:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.228-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] build info | build=0 commit="unknown" tid="139895185006592" timestamp=1741485192
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139895185006592" timestamp=1741485192 total_threads=16
+Mar 08 17:53:12 launchpad ollama[367363]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36051" tid="139895185006592" timestamp=1741485192
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 08 17:53:12 launchpad ollama[1574]: time=2025-03-08T17:53:12.478-08:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 08 17:53:12 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 08 17:53:12 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 08 17:53:12 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 08 17:53:13 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 08 17:53:13 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 08 17:53:13 launchpad ollama[367363]: INFO [main] model loaded | tid="139895185006592" timestamp=1741485193
+Mar 08 17:53:13 launchpad ollama[1574]: time=2025-03-08T17:53:13.231-08:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 08 17:53:17 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:53:17 | 200 |  5.336332001s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:55:09 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:55:09 | 200 |  2.911486641s |       127.0.0.1 | POST     "/api/chat"
+Mar 08 17:57:36 launchpad ollama[1574]: [GIN] 2025/03/08 - 17:57:36 | 200 |   3.86944111s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:18:45 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:45 | 200 |      15.084µs |       127.0.0.1 | HEAD     "/"
+Mar 11 13:18:45 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:45 | 200 |   12.645877ms |       127.0.0.1 | POST     "/api/show"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9554821120 required="6.2 GiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.472-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45623"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] build info | build=0 commit="unknown" tid="140425920761856" timestamp=1741724325
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140425920761856" timestamp=1741724325 total_threads=16
+Mar 11 13:18:45 launchpad ollama[480392]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45623" tid="140425920761856" timestamp=1741724325
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:18:45 launchpad ollama[1574]: time=2025-03-11T13:18:45.725-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:18:45 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:18:45 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:18:45 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:18:46 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:18:46 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:18:46 launchpad ollama[480392]: INFO [main] model loaded | tid="140425920761856" timestamp=1741724326
+Mar 11 13:18:46 launchpad ollama[1574]: time=2025-03-11T13:18:46.478-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:18:46 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:18:46 | 200 |  1.178127813s |       127.0.0.1 | POST     "/api/generate"
+Mar 11 13:20:50 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:20:50 | 200 |  8.295163398s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.049-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9596895232 required="6.2 GiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.049-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.050-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39123"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.051-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] build info | build=0 commit="unknown" tid="140584829214720" timestamp=1741724764
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140584829214720" timestamp=1741724764 total_threads=16
+Mar 11 13:26:04 launchpad ollama[481615]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39123" tid="140584829214720" timestamp=1741724764
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:26:04 launchpad ollama[1574]: time=2025-03-11T13:26:04.302-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:26:04 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:26:04 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:26:04 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:26:04 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:26:05 launchpad ollama[481615]: INFO [main] model loaded | tid="140584829214720" timestamp=1741724765
+Mar 11 13:26:05 launchpad ollama[1574]: time=2025-03-11T13:26:05.055-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:26:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:26:16 | 200 | 12.615909649s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665445888 required="6.2 GiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.948-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34005"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:34:35 launchpad ollama[1574]: time=2025-03-11T13:34:35.949-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] build info | build=0 commit="unknown" tid="139711692562432" timestamp=1741725275
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139711692562432" timestamp=1741725275 total_threads=16
+Mar 11 13:34:35 launchpad ollama[482865]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34005" tid="139711692562432" timestamp=1741725275
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:34:35 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:34:36 launchpad ollama[1574]: time=2025-03-11T13:34:36.201-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:34:36 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:34:36 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:34:36 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:34:36 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:34:36 launchpad ollama[482865]: INFO [main] model loaded | tid="139711692562432" timestamp=1741725276
+Mar 11 13:34:36 launchpad ollama[1574]: time=2025-03-11T13:34:36.953-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:34:48 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:34:48 | 200 | 12.905559978s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:36:27 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:36:27 | 200 |  8.950251273s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:39:04 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:39:04 | 200 | 13.612695266s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665314816 required="6.2 GiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.595-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44719"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.596-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] build info | build=0 commit="unknown" tid="139822015393792" timestamp=1741726198
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139822015393792" timestamp=1741726198 total_threads=16
+Mar 11 13:49:58 launchpad ollama[485128]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44719" tid="139822015393792" timestamp=1741726198
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:49:58 launchpad ollama[1574]: time=2025-03-11T13:49:58.847-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:49:58 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:49:58 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:49:58 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:49:59 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:49:59 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:49:59 launchpad ollama[485128]: INFO [main] model loaded | tid="139822015393792" timestamp=1741726199
+Mar 11 13:49:59 launchpad ollama[1574]: time=2025-03-11T13:49:59.600-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:50:12 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:50:12 | 200 | 13.774786975s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9661054976 required="6.2 GiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.885-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42067"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.886-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 13:58:05 launchpad ollama[1574]: time=2025-03-11T13:58:05.887-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] build info | build=0 commit="unknown" tid="139867479465984" timestamp=1741726685
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139867479465984" timestamp=1741726685 total_threads=16
+Mar 11 13:58:05 launchpad ollama[486611]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42067" tid="139867479465984" timestamp=1741726685
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 13:58:05 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 13:58:06 launchpad ollama[1574]: time=2025-03-11T13:58:06.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 13:58:06 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 13:58:06 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 13:58:06 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 13:58:06 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 13:58:06 launchpad ollama[486611]: INFO [main] model loaded | tid="139867479465984" timestamp=1741726686
+Mar 11 13:58:06 launchpad ollama[1574]: time=2025-03-11T13:58:06.890-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 13:58:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 13:58:16 | 200 |  10.61464432s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:00:41 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:00:41 | 200 |  7.264593476s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:03:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:03:29 | 200 | 12.950179046s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9669115904 required="6.2 GiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.912-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32801"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:10:05 launchpad ollama[1574]: time=2025-03-11T14:10:05.913-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] build info | build=0 commit="unknown" tid="140076242849792" timestamp=1741727405
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140076242849792" timestamp=1741727405 total_threads=16
+Mar 11 14:10:05 launchpad ollama[488380]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32801" tid="140076242849792" timestamp=1741727405
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:10:05 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:10:06 launchpad ollama[1574]: time=2025-03-11T14:10:06.165-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:10:06 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:10:06 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:10:06 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:10:06 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:10:06 launchpad ollama[488380]: INFO [main] model loaded | tid="140076242849792" timestamp=1741727406
+Mar 11 14:10:06 launchpad ollama[1574]: time=2025-03-11T14:10:06.917-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:10:12 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:10:12 | 200 |  6.487016241s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:14:25 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:14:25 | 200 |  4.391287277s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:16:10 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:16:10 | 200 |  5.606884637s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:18:51 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:18:51 | 200 | 21.000459759s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:22:53 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:22:53 | 200 | 11.938722795s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9665249280 required="6.2 GiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.177-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46727"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.178-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] build info | build=0 commit="unknown" tid="140683859144704" timestamp=1741728675
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140683859144704" timestamp=1741728675 total_threads=16
+Mar 11 14:31:15 launchpad ollama[491723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46727" tid="140683859144704" timestamp=1741728675
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:31:15 launchpad ollama[1574]: time=2025-03-11T14:31:15.429-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:31:15 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:31:15 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:31:15 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:31:16 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:31:16 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:31:16 launchpad ollama[491723]: INFO [main] model loaded | tid="140683859144704" timestamp=1741728676
+Mar 11 14:31:16 launchpad ollama[1574]: time=2025-03-11T14:31:16.181-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:31:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:31:29 | 200 | 14.558833703s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:35:18 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:35:18 | 200 |  10.25274678s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:39:40 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:39:40 | 200 | 10.013498147s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.096-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9664397312 required="6.2 GiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.096-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.097-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38937"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.098-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] build info | build=0 commit="unknown" tid="140039000276992" timestamp=1741729497
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039000276992" timestamp=1741729497 total_threads=16
+Mar 11 14:44:57 launchpad ollama[493743]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38937" tid="140039000276992" timestamp=1741729497
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 14:44:57 launchpad ollama[1574]: time=2025-03-11T14:44:57.349-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 14:44:57 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 14:44:57 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 14:44:57 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 14:44:58 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 14:44:58 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 14:44:58 launchpad ollama[493743]: INFO [main] model loaded | tid="140039000276992" timestamp=1741729498
+Mar 11 14:44:58 launchpad ollama[1574]: time=2025-03-11T14:44:58.102-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 14:45:11 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:45:11 | 200 | 14.201247672s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:47:39 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:47:39 | 200 | 11.686216522s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:49:25 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:49:25 | 200 |  2.710858779s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:50:19 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:50:19 | 200 |  4.865790386s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:51:29 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:51:29 | 200 |  9.980005186s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:52:49 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:52:49 | 200 |  9.339118759s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:53:47 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:53:47 | 200 |  6.452323377s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 14:57:59 launchpad ollama[1574]: [GIN] 2025/03/11 - 14:57:59 | 200 | 12.411862801s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.042-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9669378048 required="6.2 GiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.042-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.043-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1010747878/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39977"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.044-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] build info | build=0 commit="unknown" tid="140604860256256" timestamp=1741730598
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604860256256" timestamp=1741730598 total_threads=16
+Mar 11 15:03:18 launchpad ollama[496459]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39977" tid="140604860256256" timestamp=1741730598
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type  f32:   65 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type q4_0:  225 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llama_model_loader: - type q6_K:    1 tensors
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_vocab: special tokens cache size = 256
+Mar 11 15:03:18 launchpad ollama[1574]: time=2025-03-11T15:03:18.294-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: arch             = llama
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: vocab type       = BPE
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_vocab          = 128256
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_merges         = 280147
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: vocab_only       = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd           = 4096
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_layer          = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_head           = 32
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_head_kv        = 8
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_rot            = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_swa            = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_gqa            = 4
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ff             = 14336
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_expert         = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_expert_used    = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: causal attn      = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: pooling type     = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope type        = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope scaling     = linear
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: freq_scale_train = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_d_state      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model type       = 8B
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model ftype      = Q4_0
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model params     = 8.03 B
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_print_meta: max token length = 256
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 11 15:03:18 launchpad ollama[1574]: ggml_cuda_init: found 1 CUDA devices:
+Mar 11 15:03:18 launchpad ollama[1574]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_ctx      = 8192
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_batch    = 512
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: n_ubatch   = 512
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: flash_attn = 0
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: freq_scale = 1
+Mar 11 15:03:18 launchpad ollama[1574]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: graph nodes  = 1030
+Mar 11 15:03:18 launchpad ollama[1574]: llama_new_context_with_model: graph splits = 2
+Mar 11 15:03:19 launchpad ollama[496459]: INFO [main] model loaded | tid="140604860256256" timestamp=1741730599
+Mar 11 15:03:19 launchpad ollama[1574]: time=2025-03-11T15:03:19.047-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 11 15:03:31 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:03:31 | 200 | 14.049275516s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:07:23 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:07:23 | 200 |  6.792752956s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:11:30 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:11:30 | 200 |  5.378376894s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:13:16 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:13:16 | 200 | 10.550209232s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 15:16:10 launchpad ollama[1574]: [GIN] 2025/03/11 - 15:16:10 | 200 |  4.154373675s |       127.0.0.1 | POST     "/api/chat"
+Mar 11 16:50:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 11 16:50:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 11 16:50:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 11 16:50:25 launchpad systemd[1]: ollama.service: Consumed 16min 11.428s CPU time, 5.7G memory peak, 4.6G read from disk, 508.1M written to disk, 26.6M incoming IP traffic, 26.3M outgoing IP traffic.
+-- Boot 209de18b3b8242eb9551bbe34958d7c3 --
+Mar 12 10:23:02 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:23:02 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:23:02 launchpad ollama[1578]: 2025/03/12 10:23:02 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.672-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.677-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.680-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:23:02 launchpad ollama[1578]: time=2025-03-12T10:23:02.681-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1906912190/runners
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.609-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.610-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.611-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:05 launchpad ollama[1578]: time=2025-03-12T10:23:05.840-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 10:23:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 10:23:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 10:23:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 10:23:25 launchpad systemd[1]: ollama.service: Consumed 3.346s CPU time, 787M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot b91eb1767cd4479e933a82eeca6a29f4 --
+Mar 12 10:23:56 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:23:56 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:23:56 launchpad ollama[1580]: 2025/03/12 10:23:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.400-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.405-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.407-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:23:56 launchpad ollama[1580]: time=2025-03-12T10:23:56.411-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3159489295/runners
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.409-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.410-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.410-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.411-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.411-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:23:59 launchpad ollama[1580]: time=2025-03-12T10:23:59.653-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 10:24:28 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 10:24:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 10:24:29 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 10:24:29 launchpad systemd[1]: ollama.service: Consumed 3.400s CPU time, 787.5M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot 40aecf6c345c46cd88d74fc846eed1f8 --
+Mar 12 10:25:07 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 10:25:07 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 10:25:07 launchpad ollama[1576]: 2025/03/12 10:25:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.456-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.461-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.462-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 10:25:07 launchpad ollama[1576]: time=2025-03-12T10:25:07.464-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1772389787/runners
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.454-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.455-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.455-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.456-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.456-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 10:25:10 launchpad ollama[1576]: time=2025-03-12T10:25:10.711-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 12 15:49:11 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:49:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:49:12 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:49:12 launchpad systemd[1]: ollama.service: Consumed 4.112s CPU time, 787.8M memory peak, 234.1M read from disk, 508.1M written to disk.
+Mar 12 15:49:17 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:49:18 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:49:18 launchpad ollama[171362]: 2025/03/12 15:49:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.055-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:49:18 launchpad ollama[171362]: time=2025-03-12T15:49:18.056-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2630945923/runners
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.141-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.142-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:49:21 launchpad ollama[171362]: time=2025-03-12T15:49:21.373-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.9 GiB"
+Mar 12 15:54:24 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:54:24 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:54:24 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:54:24 launchpad systemd[1]: ollama.service: Consumed 3.556s CPU time, 552.4M memory peak, 508.1M written to disk.
+-- Boot 603fdba76a074e8aafa0425aa051a545 --
+Mar 12 15:55:06 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:55:06 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:55:06 launchpad ollama[1582]: 2025/03/12 15:55:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.198-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.203-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.204-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:55:06 launchpad ollama[1582]: time=2025-03-12T15:55:06.205-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama878283110/runners
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.184-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:09 launchpad ollama[1582]: time=2025-03-12T15:55:09.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:55:10 launchpad ollama[1582]: time=2025-03-12T15:55:10.952-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 12 15:56:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 12 15:56:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 12 15:56:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 12 15:56:25 launchpad systemd[1]: ollama.service: Consumed 5.346s CPU time, 786.6M memory peak, 233.6M read from disk, 508.1M written to disk.
+-- Boot 0ea312c34cf84f328f700fef1b38b368 --
+Mar 12 15:57:00 launchpad systemd[1]: Starting Server for local large language models...
+Mar 12 15:57:00 launchpad systemd[1]: Started Server for local large language models.
+Mar 12 15:57:00 launchpad ollama[1583]: 2025/03/12 15:57:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.992-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.997-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 12 15:57:00 launchpad ollama[1583]: time=2025-03-12T15:57:00.998-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 12 15:57:01 launchpad ollama[1583]: time=2025-03-12T15:57:01.000-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1197272828/runners
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.015-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.015-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.016-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 12 15:57:04 launchpad ollama[1583]: time=2025-03-12T15:57:04.257-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:04 | 200 |     597.248µs |       127.0.0.1 | HEAD     "/"
+Mar 13 11:00:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:04 | 200 |   16.375027ms |       127.0.0.1 | POST     "/api/show"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.250-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9191292928 required="6.2 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.250-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.251-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33525"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.253-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] build info | build=0 commit="unknown" tid="140290989002752" timestamp=1741888804
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140290989002752" timestamp=1741888804 total_threads=16
+Mar 13 11:00:04 launchpad ollama[48847]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33525" tid="140290989002752" timestamp=1741888804
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:00:04 launchpad ollama[1583]: time=2025-03-13T11:00:04.504-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:00:04 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:00:04 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:00:04 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:00:09 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:00:10 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:00:10 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:00:10 launchpad ollama[48847]: INFO [main] model loaded | tid="140290989002752" timestamp=1741888810
+Mar 13 11:00:10 launchpad ollama[1583]: time=2025-03-13T11:00:10.524-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 13 11:00:10 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:00:10 | 200 |  6.457434451s |       127.0.0.1 | POST     "/api/generate"
+Mar 13 11:01:45 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:01:45 | 200 |  8.931576314s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:05:43 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:05:43 | 200 |    7.4592717s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:07:19 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:07:19 | 200 |  7.940772107s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.000-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9195487232 required="6.2 GiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.000-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.001-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36807"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.002-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] build info | build=0 commit="unknown" tid="139961012674560" timestamp=1741889814
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139961012674560" timestamp=1741889814 total_threads=16
+Mar 13 11:16:54 launchpad ollama[51210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36807" tid="139961012674560" timestamp=1741889814
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:16:54 launchpad ollama[1583]: time=2025-03-13T11:16:54.253-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:16:54 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:16:54 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:16:54 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:16:54 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:16:55 launchpad ollama[51210]: INFO [main] model loaded | tid="139961012674560" timestamp=1741889815
+Mar 13 11:16:55 launchpad ollama[1583]: time=2025-03-13T11:16:55.258-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 13 11:17:04 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:17:04 | 200 | 10.685317637s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9195487232 required="6.2 GiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.089-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1197272828/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34067"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.090-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.091-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] build info | build=0 commit="unknown" tid="140571679039488" timestamp=1741890272
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140571679039488" timestamp=1741890272 total_threads=16
+Mar 13 11:24:32 launchpad ollama[52283]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34067" tid="140571679039488" timestamp=1741890272
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type  f32:   65 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type q4_0:  225 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_vocab: special tokens cache size = 256
+Mar 13 11:24:32 launchpad ollama[1583]: time=2025-03-13T11:24:32.341-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: arch             = llama
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: vocab type       = BPE
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_vocab          = 128256
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_merges         = 280147
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: vocab_only       = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd           = 4096
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_layer          = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_head           = 32
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_head_kv        = 8
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_rot            = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_swa            = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_gqa            = 4
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ff             = 14336
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_expert         = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: causal attn      = 1
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: pooling type     = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope type        = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope scaling     = linear
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model type       = 8B
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model params     = 8.03 B
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_print_meta: max token length = 256
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 11:24:32 launchpad ollama[1583]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 11:24:32 launchpad ollama[1583]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 13 11:24:32 launchpad ollama[1583]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_ctx      = 8192
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_batch    = 512
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: flash_attn = 0
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: freq_scale = 1
+Mar 13 11:24:33 launchpad ollama[1583]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: graph nodes  = 1030
+Mar 13 11:24:33 launchpad ollama[1583]: llama_new_context_with_model: graph splits = 2
+Mar 13 11:24:33 launchpad ollama[52283]: INFO [main] model loaded | tid="140571679039488" timestamp=1741890273
+Mar 13 11:24:33 launchpad ollama[1583]: time=2025-03-13T11:24:33.094-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 13 11:24:42 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:24:42 | 200 | 10.199471089s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:28:28 launchpad ollama[1583]: [GIN] 2025/03/13 - 11:28:28 | 200 |   5.64483084s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 11:29:05 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 11:29:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 11:29:06 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 11:29:06 launchpad systemd[1]: ollama.service: Consumed 58.161s CPU time, 5.5G memory peak, 4.6G read from disk, 508.1M written to disk, 932.9K incoming IP traffic, 1.1M outgoing IP traffic.
+-- Boot c1f31604bee640999d055f7bd130d882 --
+Mar 13 11:31:06 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 11:31:06 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 11:31:06 launchpad ollama[1580]: 2025/03/13 11:31:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.372-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.377-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.378-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 11:31:06 launchpad ollama[1580]: time=2025-03-13T11:31:06.381-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1311511704/runners
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.377-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:31:09 launchpad ollama[1580]: time=2025-03-13T11:31:09.620-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 11:35:54 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 11:35:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 11:35:54 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 11:35:54 launchpad systemd[1]: ollama.service: Consumed 3.420s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 3acb00d439a341faad90f7e8bfd12807 --
+Mar 13 11:36:26 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 11:36:26 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 11:36:26 launchpad ollama[1579]: 2025/03/13 11:36:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.740-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.746-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.747-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 11:36:26 launchpad ollama[1579]: time=2025-03-13T11:36:26.748-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1006856464/runners
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.757-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.758-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 11:36:29 launchpad ollama[1579]: time=2025-03-13T11:36:29.996-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Mar 13 12:06:05 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:06:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:06:05 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:06:05 launchpad systemd[1]: ollama.service: Consumed 3.474s CPU time, 787.1M memory peak, 233.6M read from disk, 508.1M written to disk.
+-- Boot aee0d4f29c7e48a8833a7dec2be986b1 --
+Mar 13 12:06:37 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:06:37 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:06:38 launchpad ollama[1534]: 2025/03/13 12:06:38 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.073-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.078-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.079-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:06:38 launchpad ollama[1534]: time=2025-03-13T12:06:38.080-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2809634376/runners
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.062-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.063-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.063-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.064-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.064-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:06:41 launchpad ollama[1534]: time=2025-03-13T12:06:41.279-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:07:34 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:07:34 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:07:34 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:07:34 launchpad systemd[1]: ollama.service: Consumed 3.412s CPU time, 786.5M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot baf5c96d122a45f295e18e1a02e3d587 --
+Mar 13 12:08:12 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:08:12 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:08:12 launchpad ollama[1534]: 2025/03/13 12:08:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.695-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.701-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.702-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:08:12 launchpad ollama[1534]: time=2025-03-13T12:08:12.703-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4269305187/runners
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.601-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.602-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.602-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.603-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:15 launchpad ollama[1534]: time=2025-03-13T12:08:15.603-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:08:17 launchpad ollama[1534]: time=2025-03-13T12:08:17.355-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 13 12:08:34 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:08:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:08:35 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:08:35 launchpad systemd[1]: ollama.service: Consumed 5.225s CPU time, 786.9M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 157366d5108c4d06a7edb4c7cbaab0cf --
+Mar 13 12:09:05 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:09:05 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:09:05 launchpad ollama[1530]: 2025/03/13 12:09:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.839-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.844-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.845-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:09:05 launchpad ollama[1530]: time=2025-03-13T12:09:05.847-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2657318819/runners
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.813-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.813-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:08 launchpad ollama[1530]: time=2025-03-13T12:09:08.814-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:09:09 launchpad ollama[1530]: time=2025-03-13T12:09:09.034-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:11:19 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:11:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:11:19 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:11:19 launchpad systemd[1]: ollama.service: Consumed 3.429s CPU time, 786.6M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot 5dbee777929942a9be95deb1417c3c46 --
+Mar 13 12:12:23 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:12:24 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:12:24 launchpad ollama[1530]: 2025/03/13 12:12:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.109-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.114-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.115-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:12:24 launchpad ollama[1530]: time=2025-03-13T12:12:24.117-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2006416511/runners
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.101-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:12:27 launchpad ollama[1530]: time=2025-03-13T12:12:27.294-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 12:24:12 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 12:24:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 12:24:12 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 12:24:12 launchpad systemd[1]: ollama.service: Consumed 3.419s CPU time, 786.5M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 9892e1de7a774876a48ebed024abc84d --
+Mar 13 12:24:44 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 12:24:45 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 12:24:45 launchpad ollama[1527]: 2025/03/13 12:24:45 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.131-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.136-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.137-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 12:24:45 launchpad ollama[1527]: time=2025-03-13T12:24:45.138-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4001498712/runners
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.128-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.129-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.129-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 12:24:48 launchpad ollama[1527]: time=2025-03-13T12:24:48.327-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 14:30:09 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 14:30:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 14:30:09 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 14:30:09 launchpad systemd[1]: ollama.service: Consumed 3.514s CPU time, 786.7M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot 39d8f516509c4d6494bf51d29f68080d --
+Mar 13 14:30:51 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 14:30:51 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 14:30:51 launchpad ollama[1541]: 2025/03/13 14:30:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 14:30:51 launchpad ollama[1541]: time=2025-03-13T14:30:51.995-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.000-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.001-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 14:30:52 launchpad ollama[1541]: time=2025-03-13T14:30:52.002-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1028367990/runners
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.842-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.843-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.843-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.844-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:54 launchpad ollama[1541]: time=2025-03-13T14:30:54.844-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:30:56 launchpad ollama[1541]: time=2025-03-13T14:30:56.553-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Mar 13 14:31:29 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 13 14:31:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 13 14:31:29 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 13 14:31:29 launchpad systemd[1]: ollama.service: Consumed 5.153s CPU time, 786.9M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot 071f7081c98f4fdcaa6c2adb0b749244 --
+Mar 13 14:32:00 launchpad systemd[1]: Starting Server for local large language models...
+Mar 13 14:32:00 launchpad systemd[1]: Started Server for local large language models.
+Mar 13 14:32:00 launchpad ollama[1553]: 2025/03/13 14:32:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.916-07:00 level=INFO source=images.go:753 msg="total blobs: 24"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.921-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.922-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 13 14:32:00 launchpad ollama[1553]: time=2025-03-13T14:32:00.924-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama352866884/runners
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.855-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:03 launchpad ollama[1553]: time=2025-03-13T14:32:03.856-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 13 14:32:04 launchpad ollama[1553]: time=2025-03-13T14:32:04.065-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 13 16:17:43 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:43 | 200 |     557.539µs |       127.0.0.1 | HEAD     "/"
+Mar 13 16:17:43 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:43 | 200 |    6.958071ms |       127.0.0.1 | POST     "/api/show"
+Mar 13 16:17:43 launchpad ollama[1553]: time=2025-03-13T16:17:43.906-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10023403520 required="9.2 GiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.058-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 41581"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.059-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.060-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] build info | build=0 commit="unknown" tid="140323271462912" timestamp=1741907864
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140323271462912" timestamp=1741907864 total_threads=16
+Mar 13 16:17:44 launchpad ollama[23651]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41581" tid="140323271462912" timestamp=1741907864
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 16:17:44 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 16:17:44 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 16:17:44 launchpad ollama[1553]: time=2025-03-13T16:17:44.311-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 16:17:44 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 16:17:52 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 16:17:52 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 16:17:52 launchpad ollama[23651]: INFO [main] model loaded | tid="140323271462912" timestamp=1741907872
+Mar 13 16:17:53 launchpad ollama[1553]: time=2025-03-13T16:17:53.085-07:00 level=INFO source=server.go:626 msg="llama runner started in 9.03 seconds"
+Mar 13 16:17:53 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:17:53 | 200 |  9.182304682s |       127.0.0.1 | POST     "/api/generate"
+Mar 13 16:19:48 launchpad ollama[1553]: time=2025-03-13T16:19:48.407-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 16:19:58 launchpad ollama[1553]: [GIN] 2025/03/13 - 16:19:58 | 200 |  9.720090366s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:20:12 launchpad ollama[1553]: time=2025-03-13T17:20:12.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.038-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9956425728 required="9.2 GiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.038-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.039-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 42287"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.040-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] build info | build=0 commit="unknown" tid="139751712792576" timestamp=1741911613
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139751712792576" timestamp=1741911613 total_threads=16
+Mar 13 17:20:13 launchpad ollama[34476]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42287" tid="139751712792576" timestamp=1741911613
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 17:20:13 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 17:20:13 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 17:20:13 launchpad ollama[1553]: time=2025-03-13T17:20:13.331-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 17:20:13 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 17:20:14 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 17:20:14 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 17:20:14 launchpad ollama[34476]: INFO [main] model loaded | tid="139751712792576" timestamp=1741911614
+Mar 13 17:20:14 launchpad ollama[1553]: time=2025-03-13T17:20:14.334-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 13 17:20:29 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:20:29 | 200 | 16.673622598s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:21:51 launchpad ollama[1553]: time=2025-03-13T17:21:51.661-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:21:57 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:21:57 | 200 |  5.496144275s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.221-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9958457344 required="9.2 GiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.0 GiB" free_swap="68.9 GiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.376-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38611"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.377-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.378-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] build info | build=0 commit="unknown" tid="139831091875840" timestamp=1741912670
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139831091875840" timestamp=1741912670 total_threads=16
+Mar 13 17:37:50 launchpad ollama[38232]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38611" tid="139831091875840" timestamp=1741912670
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 13 17:37:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 13 17:37:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 13 17:37:50 launchpad ollama[1553]: time=2025-03-13T17:37:50.667-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors: offloaded 41/41 layers to GPU
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Mar 13 17:37:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 13 17:37:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 13 17:37:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 13 17:37:51 launchpad ollama[38232]: INFO [main] model loaded | tid="139831091875840" timestamp=1741912671
+Mar 13 17:37:51 launchpad ollama[1553]: time=2025-03-13T17:37:51.671-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 13 17:38:11 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:38:11 | 200 |  20.86947765s |       127.0.0.1 | POST     "/api/chat"
+Mar 13 17:39:02 launchpad ollama[1553]: time=2025-03-13T17:39:02.693-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 13 17:39:13 launchpad ollama[1553]: [GIN] 2025/03/13 - 17:39:13 | 200 | 11.067425774s |       127.0.0.1 | POST     "/api/chat"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.689-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.851-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.8 GiB" free_swap="68.9 GiB"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.852-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=40 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.1 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.1 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 40 --parallel 1 --port 40861"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 14 10:33:03 launchpad ollama[1553]: time=2025-03-14T10:33:03.853-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] build info | build=0 commit="unknown" tid="139884910071808" timestamp=1741973583
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139884910071808" timestamp=1741973583 total_threads=16
+Mar 14 10:33:03 launchpad ollama[81310]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40861" tid="139884910071808" timestamp=1741973583
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 14 10:33:03 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 14 10:33:03 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 14 10:33:03 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: time=2025-03-14T10:33:04.137-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors: offloading 40 repeating layers to GPU
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors: offloaded 40/41 layers to GPU
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6807.81 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 14 10:33:04 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 14 10:33:04 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 4
+Mar 14 10:33:04 launchpad ollama[81310]: INFO [main] model loaded | tid="139884910071808" timestamp=1741973584
+Mar 14 10:33:05 launchpad ollama[1553]: time=2025-03-14T10:33:05.141-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 14 10:33:19 launchpad ollama[1553]: [GIN] 2025/03/14 - 10:33:19 | 200 | 15.442020204s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:11:55 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:55 | 200 |      14.236µs |       127.0.0.1 | HEAD     "/"
+Mar 15 14:11:55 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:55 | 200 |    3.460728ms |       127.0.0.1 | POST     "/api/show"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.543-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.704-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.5 GiB" free_swap="68.9 GiB"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.704-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 37509"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.706-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] build info | build=0 commit="unknown" tid="140427337490432" timestamp=1742073115
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140427337490432" timestamp=1742073115 total_threads=16
+Mar 15 14:11:55 launchpad ollama[197540]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37509" tid="140427337490432" timestamp=1742073115
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 14:11:55 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 14:11:55 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 14:11:55 launchpad ollama[1553]: time=2025-03-15T14:11:55.992-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 14:11:55 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 14:11:56 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 14:11:56 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 14:11:56 launchpad ollama[197540]: INFO [main] model loaded | tid="140427337490432" timestamp=1742073116
+Mar 15 14:11:56 launchpad ollama[1553]: time=2025-03-15T14:11:56.996-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 14:11:56 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:11:56 | 200 |  1.455882801s |       127.0.0.1 | POST     "/api/generate"
+Mar 15 14:15:38 launchpad ollama[1553]: time=2025-03-15T14:15:38.730-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:15:57 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:15:57 | 200 | 19.213770804s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:16:25 launchpad ollama[1553]: time=2025-03-15T14:16:25.081-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:16:47 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:16:47 | 200 | 22.249997596s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.422-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.580-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.3 GiB" free_swap="68.9 GiB"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.581-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 44303"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.582-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] build info | build=0 commit="unknown" tid="140708567494656" timestamp=1742075803
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140708567494656" timestamp=1742075803 total_threads=16
+Mar 15 14:56:43 launchpad ollama[235944]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44303" tid="140708567494656" timestamp=1742075803
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 14:56:43 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 14:56:43 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 14:56:43 launchpad ollama[1553]: time=2025-03-15T14:56:43.870-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 14:56:43 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 14:56:44 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 14:56:44 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 14:56:44 launchpad ollama[235944]: INFO [main] model loaded | tid="140708567494656" timestamp=1742075804
+Mar 15 14:56:44 launchpad ollama[1553]: time=2025-03-15T14:56:44.874-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 14:56:57 launchpad ollama[1553]: [GIN] 2025/03/15 - 14:56:57 | 200 | 14.438564166s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.516-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.673-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.3 GiB" free_swap="68.9 GiB"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.674-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 42201"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.675-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] build info | build=0 commit="unknown" tid="139798432841728" timestamp=1742076529
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139798432841728" timestamp=1742076529 total_threads=16
+Mar 15 15:08:49 launchpad ollama[261636]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42201" tid="139798432841728" timestamp=1742076529
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 15:08:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 15:08:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 15:08:49 launchpad ollama[1553]: time=2025-03-15T15:08:49.960-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 15:08:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 15:08:50 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 15:08:50 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 15:08:50 launchpad ollama[261636]: INFO [main] model loaded | tid="139798432841728" timestamp=1742076530
+Mar 15 15:08:50 launchpad ollama[1553]: time=2025-03-15T15:08:50.964-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.29 seconds"
+Mar 15 15:09:12 launchpad ollama[1553]: [GIN] 2025/03/15 - 15:09:12 | 200 | 22.680878508s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.593-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.750-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.8 GiB" free_swap="68.9 GiB"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.750-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 38893"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 18:14:32 launchpad ollama[1553]: time=2025-03-15T18:14:32.751-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] build info | build=0 commit="unknown" tid="140515044581376" timestamp=1742087672
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140515044581376" timestamp=1742087672 total_threads=16
+Mar 15 18:14:32 launchpad ollama[976378]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38893" tid="140515044581376" timestamp=1742087672
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 18:14:32 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 18:14:32 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 18:14:32 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: time=2025-03-15T18:14:33.053-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 18:14:33 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 18:14:33 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 18:14:33 launchpad ollama[976378]: INFO [main] model loaded | tid="140515044581376" timestamp=1742087673
+Mar 15 18:14:34 launchpad ollama[1553]: time=2025-03-15T18:14:34.056-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Mar 15 18:14:48 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:14:48 | 200 | 15.625807839s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:16:42 launchpad ollama[1553]: time=2025-03-15T18:16:42.938-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:16:51 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:16:51 | 200 |  8.085941694s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:19:54 launchpad ollama[1553]: time=2025-03-15T18:19:54.708-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:20:11 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:20:11 | 200 | 16.835525025s |       127.0.0.1 | POST     "/api/chat"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.266-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.417-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.8 GiB" free_swap="68.9 GiB"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.417-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 41137"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.418-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] build info | build=0 commit="unknown" tid="140524964081664" timestamp=1742089079
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140524964081664" timestamp=1742089079 total_threads=16
+Mar 15 18:37:59 launchpad ollama[988720]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41137" tid="140524964081664" timestamp=1742089079
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type  f32:   81 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type q4_0:  281 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 3
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V2
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: vocab type       = SPM
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 32016
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_head           = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 40
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 13824
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 1000000.0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model type       = 13B
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model params     = 13.02 B
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: general.name     = codellama
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 1 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 2 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: UNK token        = 0 ''
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: SUF token        = 32008 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: MID token        = 32009 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 32010 '▁'
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_print_meta: max token length = 48
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 15 18:37:59 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 15 18:37:59 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Mar 15 18:37:59 launchpad ollama[1553]: time=2025-03-15T18:37:59.711-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: offloading 38 repeating layers to GPU
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors: offloaded 38/41 layers to GPU
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Mar 15 18:37:59 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 2048
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 1000000.0
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 15 18:38:00 launchpad ollama[1553]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1286
+Mar 15 18:38:00 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 26
+Mar 15 18:38:00 launchpad ollama[988720]: INFO [main] model loaded | tid="140524964081664" timestamp=1742089080
+Mar 15 18:38:00 launchpad ollama[1553]: time=2025-03-15T18:38:00.715-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Mar 15 18:38:19 launchpad ollama[1553]: [GIN] 2025/03/15 - 18:38:19 | 200 | 20.309878338s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:01:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:08 | 200 |       21.15µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:01:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:08 | 200 |   17.330488ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9252044800 required="6.2 GiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.770-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46863"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:01:08 launchpad ollama[1553]: time=2025-03-19T14:01:08.772-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] build info | build=0 commit="unknown" tid="140604567769088" timestamp=1742418068
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604567769088" timestamp=1742418068 total_threads=16
+Mar 19 14:01:08 launchpad ollama[1061925]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46863" tid="140604567769088" timestamp=1742418068
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:01:08 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:01:09 launchpad ollama[1553]: time=2025-03-19T14:01:09.023-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:01:09 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:01:09 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:01:09 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:01:14 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:01:14 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:01:14 launchpad ollama[1061925]: INFO [main] model loaded | tid="140604567769088" timestamp=1742418074
+Mar 19 14:01:14 launchpad ollama[1553]: time=2025-03-19T14:01:14.787-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.02 seconds"
+Mar 19 14:01:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:01:14 | 200 |  6.210392007s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9287958528 required="6.2 GiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.010-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35981"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.011-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] build info | build=0 commit="unknown" tid="140374145241088" timestamp=1742418478
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140374145241088" timestamp=1742418478 total_threads=16
+Mar 19 14:07:58 launchpad ollama[1062918]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35981" tid="140374145241088" timestamp=1742418478
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:07:58 launchpad ollama[1553]: time=2025-03-19T14:07:58.262-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:07:58 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:07:58 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:07:58 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:07:58 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:07:59 launchpad ollama[1062918]: INFO [main] model loaded | tid="140374145241088" timestamp=1742418479
+Mar 19 14:07:59 launchpad ollama[1553]: time=2025-03-19T14:07:59.014-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:08:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:08:06 | 200 |  8.475536837s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:10:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:10:11 | 200 |  6.938443807s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.104-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9321512960 required="6.2 GiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.105-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.105-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34483"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] build info | build=0 commit="unknown" tid="140055665532928" timestamp=1742418934
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140055665532928" timestamp=1742418934 total_threads=16
+Mar 19 14:15:34 launchpad ollama[1064012]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34483" tid="140055665532928" timestamp=1742418934
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:15:34 launchpad ollama[1553]: time=2025-03-19T14:15:34.357-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:15:34 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:15:34 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:15:34 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:15:35 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:15:35 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:15:35 launchpad ollama[1064012]: INFO [main] model loaded | tid="140055665532928" timestamp=1742418935
+Mar 19 14:15:35 launchpad ollama[1553]: time=2025-03-19T14:15:35.110-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:15:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:15:44 | 200 | 10.615297126s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:19:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:19:16 | 200 |  6.355186351s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9318891520 required="6.2 GiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.105-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42893"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] build info | build=0 commit="unknown" tid="140046024835072" timestamp=1742419501
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140046024835072" timestamp=1742419501 total_threads=16
+Mar 19 14:25:01 launchpad ollama[1065345]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42893" tid="140046024835072" timestamp=1742419501
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:25:01 launchpad ollama[1553]: time=2025-03-19T14:25:01.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:25:01 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:25:01 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:25:01 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:25:02 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:25:02 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:25:02 launchpad ollama[1065345]: INFO [main] model loaded | tid="140046024835072" timestamp=1742419502
+Mar 19 14:25:02 launchpad ollama[1553]: time=2025-03-19T14:25:02.361-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 14:25:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:25:12 | 200 | 11.793706316s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:28:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:28:11 | 200 |  2.301137665s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:29:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:29:28 | 200 |  1.269617761s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:30:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:30:37 | 200 |  1.314152996s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:33:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:33:21 | 200 |  1.357600299s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:35:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:35:14 | 200 |  5.456157523s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:37:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:37:34 | 200 |  4.108627358s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:46:10 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:10 | 200 |      20.479µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:46:10 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:10 | 200 |    1.593569ms |       127.0.0.1 | GET      "/api/tags"
+Mar 19 14:46:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:24 | 200 |      15.723µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:46:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:46:24 | 200 |   12.732467ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:48:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:48:37 | 200 |      15.975µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:48:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:48:37 | 200 |   12.547593ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:44 | 200 |      15.458µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:44 | 200 |   12.842438ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:49 | 200 |      15.453µs |       127.0.0.1 | HEAD     "/"
+Mar 19 14:56:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:49 | 200 |   12.983892ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.143-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9286909952 required="6.2 GiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.143-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.8 GiB" free_swap="68.9 GiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.144-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40963"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.145-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] build info | build=0 commit="unknown" tid="139701962080256" timestamp=1742421410
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139701962080256" timestamp=1742421410 total_threads=16
+Mar 19 14:56:50 launchpad ollama[1070359]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40963" tid="139701962080256" timestamp=1742421410
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 14:56:50 launchpad ollama[1553]: time=2025-03-19T14:56:50.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 14:56:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 14:56:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 14:56:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 8192
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 14:56:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 14:56:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 14:56:51 launchpad ollama[1070359]: INFO [main] model loaded | tid="139701962080256" timestamp=1742421411
+Mar 19 14:56:51 launchpad ollama[1553]: time=2025-03-19T14:56:51.149-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 19 14:56:51 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:56:51 | 200 |  1.192376819s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 14:57:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:57:46 | 200 |  7.790673944s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:58:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:58:02 | 200 |  541.928223ms |       127.0.0.1 | POST     "/api/chat"
+Mar 19 14:58:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 14:58:41 | 200 |  750.543335ms |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:00:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:00:08 | 200 |      24.017µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:00:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:00:08 | 200 |      55.217µs |       127.0.0.1 | GET      "/api/ps"
+Mar 19 15:05:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:05:52 | 200 |      16.498µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:05:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:05:52 | 200 |   12.533954ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 15:09:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:09:53 | 200 |      15.373µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:05 | 200 |       20.55µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:07 | 200 |      48.615µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 15:10:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:07 | 200 |   29.193307ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 15:10:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:14 | 200 |      14.646µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:14 | 200 |     766.332µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 15:10:35 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:35 | 200 |      22.106µs |       127.0.0.1 | HEAD     "/"
+Mar 19 15:10:35 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:35 | 200 |   12.866448ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.909-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9301458944 required="7.7 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.909-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.910-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35497"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:10:35 launchpad ollama[1553]: time=2025-03-19T15:10:35.911-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] build info | build=0 commit="unknown" tid="139727709519872" timestamp=1742422235
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139727709519872" timestamp=1742422235 total_threads=16
+Mar 19 15:10:35 launchpad ollama[1073468]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35497" tid="139727709519872" timestamp=1742422235
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:10:35 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:10:36 launchpad ollama[1553]: time=2025-03-19T15:10:36.163-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:10:36 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:10:36 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:10:36 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:10:36 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:10:36 launchpad ollama[1073468]: INFO [main] model loaded | tid="139727709519872" timestamp=1742422236
+Mar 19 15:10:36 launchpad ollama[1553]: time=2025-03-19T15:10:36.920-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 15:10:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:36 | 200 |  1.198454452s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 15:10:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:10:49 | 200 |  6.835704028s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9278390272 required="7.7 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38595"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.525-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] build info | build=0 commit="unknown" tid="140121490862080" timestamp=1742422569
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140121490862080" timestamp=1742422569 total_threads=16
+Mar 19 15:16:09 launchpad ollama[1074353]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38595" tid="140121490862080" timestamp=1742422569
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:16:09 launchpad ollama[1553]: time=2025-03-19T15:16:09.776-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:16:09 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:16:09 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:16:09 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:16:10 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:16:10 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:16:10 launchpad ollama[1074353]: INFO [main] model loaded | tid="140121490862080" timestamp=1742422570
+Mar 19 15:16:10 launchpad ollama[1553]: time=2025-03-19T15:16:10.535-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 15:16:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:16:20 | 200 | 11.446151344s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9278390272 required="7.7 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.7 GiB" free_swap="68.9 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.171-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37839"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.172-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] build info | build=0 commit="unknown" tid="139992453177344" timestamp=1742423041
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139992453177344" timestamp=1742423041 total_threads=16
+Mar 19 15:24:01 launchpad ollama[1075479]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37839" tid="139992453177344" timestamp=1742423041
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:24:01 launchpad ollama[1553]: time=2025-03-19T15:24:01.423-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:24:01 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:24:01 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:24:01 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:24:02 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:24:02 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:24:02 launchpad ollama[1075479]: INFO [main] model loaded | tid="139992453177344" timestamp=1742423042
+Mar 19 15:24:02 launchpad ollama[1553]: time=2025-03-19T15:24:02.427-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 15:24:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:24:12 | 200 | 11.890800678s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281011712 required="7.7 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46409"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.190-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.191-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] build info | build=0 commit="unknown" tid="140440360804352" timestamp=1742424650
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140440360804352" timestamp=1742424650 total_threads=16
+Mar 19 15:50:50 launchpad ollama[1079902]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46409" tid="140440360804352" timestamp=1742424650
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 15:50:50 launchpad ollama[1553]: time=2025-03-19T15:50:50.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 15:50:50 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 15:50:50 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 15:50:50 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 15:50:51 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 15:50:51 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 15:50:51 launchpad ollama[1079902]: INFO [main] model loaded | tid="140440360804352" timestamp=1742424651
+Mar 19 15:50:51 launchpad ollama[1553]: time=2025-03-19T15:50:51.445-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 15:50:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:50:59 | 200 |  9.199214165s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:52:45 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:52:45 | 200 |  8.535873447s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:53:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:53:36 | 200 | 10.706122932s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:55:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:55:18 | 200 |  8.647136848s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 15:58:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 15:58:15 | 200 |  8.602667884s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:00:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:00:16 | 200 |  8.253027637s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:04:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:04:16 | 200 | 11.007915608s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.592-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281536000 required="7.7 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.592-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.593-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39179"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.594-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] build info | build=0 commit="unknown" tid="140212455632896" timestamp=1742425959
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140212455632896" timestamp=1742425959 total_threads=16
+Mar 19 16:12:39 launchpad ollama[1083210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39179" tid="140212455632896" timestamp=1742425959
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:12:39 launchpad ollama[1553]: time=2025-03-19T16:12:39.845-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:12:39 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:12:39 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:12:39 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:12:40 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:12:40 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:12:40 launchpad ollama[1083210]: INFO [main] model loaded | tid="140212455632896" timestamp=1742425960
+Mar 19 16:12:40 launchpad ollama[1553]: time=2025-03-19T16:12:40.848-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:12:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:12:59 | 200 | 19.734470957s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:14:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:14:34 | 200 |  12.53505037s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.174-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9281536000 required="7.7 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.174-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.175-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.175-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 38891"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.176-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] build info | build=0 commit="unknown" tid="140123967619072" timestamp=1742426672
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140123967619072" timestamp=1742426672 total_threads=16
+Mar 19 16:24:32 launchpad ollama[1084909]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38891" tid="140123967619072" timestamp=1742426672
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:24:32 launchpad ollama[1553]: time=2025-03-19T16:24:32.427-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:24:32 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:24:32 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:24:32 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:24:33 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:24:33 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:24:33 launchpad ollama[1084909]: INFO [main] model loaded | tid="140123967619072" timestamp=1742426673
+Mar 19 16:24:33 launchpad ollama[1553]: time=2025-03-19T16:24:33.431-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:24:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:24:52 | 200 | 20.794159326s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:29:23 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:29:23 | 200 |  8.824535282s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:30:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:30:46 | 200 |  6.784859875s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:37:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:37:12 | 200 |         4m41s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.147-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41117"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.148-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.149-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] build info | build=0 commit="unknown" tid="140624371212288" timestamp=1742427925
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624371212288" timestamp=1742427925 total_threads=16
+Mar 19 16:45:25 launchpad ollama[1087999]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41117" tid="140624371212288" timestamp=1742427925
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:45:25 launchpad ollama[1553]: time=2025-03-19T16:45:25.399-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:45:25 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:45:25 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:45:25 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:45:26 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:45:26 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:45:26 launchpad ollama[1087999]: INFO [main] model loaded | tid="140624371212288" timestamp=1742427926
+Mar 19 16:45:26 launchpad ollama[1553]: time=2025-03-19T16:45:26.403-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:46:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:46:18 | 200 | 53.479928364s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:47:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:47:30 | 200 |   12.893698ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:50:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:50:22 | 200 |   12.498092ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:50:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:50:48 | 200 |   12.886171ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:55:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:32 | 200 |      15.615µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:32 | 200 |     453.855µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 16:55:55 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:55 | 200 |      50.492µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:55 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:55 | 200 |     1.44822ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 16:55:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:58 | 200 |      31.101µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:55:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:55:58 | 200 |     468.123µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 16:56:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:18 | 200 |      15.452µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:21 | 200 |      67.654µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 16:56:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:21 | 200 |   26.437434ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 16:56:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:33 | 200 |      15.129µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:33 | 200 |    1.261061ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 16:56:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:37 | 200 |      15.369µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:40 | 200 |      91.099µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 16:56:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:40 | 200 |   28.411029ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 16:56:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:48 | 200 |      19.801µs |       127.0.0.1 | HEAD     "/"
+Mar 19 16:56:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:48 | 200 |   12.680799ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.870-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.870-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.871-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43755"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 16:56:48 launchpad ollama[1553]: time=2025-03-19T16:56:48.872-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] build info | build=0 commit="unknown" tid="140007767351296" timestamp=1742428608
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140007767351296" timestamp=1742428608 total_threads=16
+Mar 19 16:56:48 launchpad ollama[1090073]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43755" tid="140007767351296" timestamp=1742428608
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 16:56:48 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 16:56:49 launchpad ollama[1553]: time=2025-03-19T16:56:49.123-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 16:56:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 16:56:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 16:56:49 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 16:56:49 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 16:56:49 launchpad ollama[1090073]: INFO [main] model loaded | tid="140007767351296" timestamp=1742428609
+Mar 19 16:56:50 launchpad ollama[1553]: time=2025-03-19T16:56:50.126-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 16:56:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:56:50 | 200 |  1.436506149s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 16:59:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:59:16 | 200 |  3.779718793s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 16:59:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 16:59:40 | 200 |   3.14516953s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:00:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:50 | 200 |      14.881µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:00:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:50 | 200 |    1.513938ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 17:00:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:53 | 200 |      16.591µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:00:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:56 | 200 |      47.845µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 17:00:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:00:56 | 200 |   26.827483ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |      16.344µs |       127.0.0.1 | HEAD     "/"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |    13.00695ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:01:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:01 | 200 |    13.15404ms |       127.0.0.1 | POST     "/api/generate"
+Mar 19 17:01:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:05 | 200 |  1.045303354s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:01:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:01:24 | 200 |  1.878753637s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.820-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.821-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.6 GiB" free_swap="68.9 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.821-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35723"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 17:09:14 launchpad ollama[1553]: time=2025-03-19T17:09:14.822-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] build info | build=0 commit="unknown" tid="139957758668800" timestamp=1742429354
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139957758668800" timestamp=1742429354 total_threads=16
+Mar 19 17:09:14 launchpad ollama[1092033]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35723" tid="139957758668800" timestamp=1742429354
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 17:09:14 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 17:09:15 launchpad ollama[1553]: time=2025-03-19T17:09:15.073-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 17:09:15 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 17:09:15 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 17:09:15 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 17:09:15 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 17:09:15 launchpad ollama[1092033]: INFO [main] model loaded | tid="139957758668800" timestamp=1742429355
+Mar 19 17:09:15 launchpad ollama[1553]: time=2025-03-19T17:09:15.830-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 17:09:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:09:26 | 200 | 11.549671136s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9280552960 required="7.7 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.5 GiB" free_swap="68.9 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.028-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 42231"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.029-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] build info | build=0 commit="unknown" tid="140655190937600" timestamp=1742429830
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140655190937600" timestamp=1742429830 total_threads=16
+Mar 19 17:17:10 launchpad ollama[1093171]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42231" tid="140655190937600" timestamp=1742429830
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 17:17:10 launchpad ollama[1553]: time=2025-03-19T17:17:10.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 17:17:10 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 17:17:10 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 17:17:10 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 17:17:10 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 17:17:11 launchpad ollama[1093171]: INFO [main] model loaded | tid="140655190937600" timestamp=1742429831
+Mar 19 17:17:11 launchpad ollama[1553]: time=2025-03-19T17:17:11.283-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 17:17:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:17:20 | 200 | 11.089850045s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:18:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:18:34 | 200 |  8.608081362s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:22:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:22:28 | 200 |  8.072222972s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:24:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:24:34 | 200 |  1.238467883s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:25:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:25:52 | 200 |  1.353795697s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:26:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:07 | 200 |   12.478808ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:26:17 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:17 | 200 |  1.430607912s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:26:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:26:47 | 200 |   12.951997ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:27:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:27:09 | 200 |   1.79384077s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:27:49 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:27:49 | 200 |   12.373791ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:28:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:28:02 | 200 |  8.495199451s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:32:40 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:32:40 | 200 |  2.553248593s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:32:52 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:32:52 | 200 |  7.599296148s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:33:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:33:53 | 200 |  2.814389526s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:34:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:34:06 | 200 |  9.097858226s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:34:54 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:34:54 | 200 |  5.215343626s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:35:07 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:35:07 | 200 |  3.618264464s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:35:21 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:35:21 | 200 |  3.527019179s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:12 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:12 | 200 |  7.495950982s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:18 | 200 |  4.124787454s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:36:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:36:30 | 200 |  5.509177899s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:37:05 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:37:05 | 200 |  5.768666334s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:37:19 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:37:19 | 200 |   6.10250155s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:38:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:38:01 | 200 | 11.900695987s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:38:19 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:38:19 | 200 | 12.448423864s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:39:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:06 | 200 |   13.272028ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 17:39:25 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:25 | 200 | 16.280344686s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 17:39:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 17:39:56 | 200 | 16.733725425s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9279242240 required="7.7 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.5 GiB" free_swap="68.9 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.243-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.244-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41107"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.245-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] build info | build=0 commit="unknown" tid="140290009657344" timestamp=1742433310
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140290009657344" timestamp=1742433310 total_threads=16
+Mar 19 18:15:10 launchpad ollama[1101363]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41107" tid="140290009657344" timestamp=1742433310
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 18:15:10 launchpad ollama[1553]: time=2025-03-19T18:15:10.495-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 18:15:10 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 18:15:10 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 18:15:10 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 18:15:11 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 18:15:11 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 18:15:11 launchpad ollama[1101363]: INFO [main] model loaded | tid="140290009657344" timestamp=1742433311
+Mar 19 18:15:11 launchpad ollama[1553]: time=2025-03-19T18:15:11.499-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 18:15:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:15:27 | 200 | 17.277527942s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:31:16 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:31:16 | 200 |        15m13s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:32:23 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:32:23 | 200 |  5.721499854s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:32:51 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:32:51 | 200 |  6.442060577s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:33:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:33:53 | 200 |  3.703942131s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:34:31 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:34:31 | 200 |  2.968032978s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:36:24 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:36:24 | 200 |   2.75354646s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:36:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:36:53 | 200 |  4.395223699s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:37:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:37:11 | 200 |  7.272151729s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:37:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:37:53 | 200 |  6.228299736s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9264496640 required="7.7 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.4 GiB" free_swap="68.9 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.053-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37565"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.054-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] build info | build=0 commit="unknown" tid="139917702578176" timestamp=1742435209
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139917702578176" timestamp=1742435209 total_threads=16
+Mar 19 18:46:49 launchpad ollama[1106074]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37565" tid="139917702578176" timestamp=1742435209
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 18:46:49 launchpad ollama[1553]: time=2025-03-19T18:46:49.305-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 18:46:49 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 18:46:49 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 18:46:49 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 18:46:49 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 18:46:50 launchpad ollama[1106074]: INFO [main] model loaded | tid="139917702578176" timestamp=1742435210
+Mar 19 18:46:50 launchpad ollama[1553]: time=2025-03-19T18:46:50.061-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 19 18:47:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:47:00 | 200 | 11.785600436s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:49:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:49:46 | 200 |  8.865631358s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:51:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:51:30 | 200 | 11.177185966s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:52:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:52:44 | 200 |  8.542652753s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:56:32 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:56:32 | 200 |  10.46354348s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:58:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:58:00 | 200 | 12.471833057s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 18:59:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 18:59:36 | 200 |  2.481295797s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:00:03 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:00:03 | 200 |  8.969396025s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:00:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:00:41 | 200 |  9.171149938s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:01:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:01:27 | 200 |  7.960006373s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:01:54 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:01:54 | 200 | 11.638280693s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:02:13 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:02:13 | 200 |  9.969332277s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:02:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:02:33 | 200 | 14.522601219s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:05:25 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:05:25 | 200 |         1m45s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:05:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:05:50 | 200 | 10.538393198s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:07:06 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:07:06 | 200 |  5.947861596s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:07:36 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:07:36 | 200 | 18.144623386s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:08:34 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:08:34 | 200 |   13.045318ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:08:43 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:08:43 | 200 |  4.751733702s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:09:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:09:18 | 200 |  4.804210665s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:10:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:10:22 | 200 |  5.464388284s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:11:39 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:11:39 | 200 |  4.175226817s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:13:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:13:15 | 200 |  3.617657116s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:13:58 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:13:58 | 200 |   12.832254ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:14 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:14 | 200 |   12.346782ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:28 | 200 |   13.036026ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:37 | 200 |   12.309953ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:14:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:14:53 | 200 |   12.602511ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:15:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:15:46 | 200 |  4.035332668s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:16:53 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:16:53 | 200 |  5.417289161s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:18:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:18:30 | 200 |  2.049013566s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:19:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:19:50 | 200 |  1.872510983s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:20:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:20:44 | 200 |   12.679108ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:21:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:21:59 | 200 |  2.108290057s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:22:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:22:01 | 200 |  2.060780391s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:26:27 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:27 | 200 |      16.242µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:26:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:30 | 200 |      46.924µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:26:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:30 | 200 |   23.596124ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |      26.615µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |   12.355854ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:26:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:37 | 200 |   12.726734ms |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:26:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:26:47 | 200 |  3.857925263s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:27:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:27:33 | 200 |   5.58264154s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:29:02 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:29:02 | 200 |   8.70896357s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:30:17 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:30:17 | 200 |  5.825537391s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:31:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:31:09 | 200 |  5.019266652s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:32:18 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:32:18 | 200 |  6.983706554s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:33:38 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:33:38 | 200 |  6.042807373s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:35:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:35:26 | 200 |  7.537894802s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:36:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:36:37 | 200 |      16.182µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:36:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:36:37 | 200 |    1.450239ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 19:42:38 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:38 | 200 |      15.143µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:42:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:41 | 200 |      55.022µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:42:41 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:42:41 | 200 |   24.173529ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 200 |      15.006µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 404 |      60.392µs |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:43:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:11 | 400 |     344.364µs |       127.0.0.1 | POST     "/api/pull"
+Mar 19 19:43:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:20 | 200 |      16.487µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:43:20 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:20 | 200 |   12.909453ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9074900992 required="7.7 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.3 GiB" free_swap="68.9 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.063-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41931"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.064-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.065-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] build info | build=0 commit="unknown" tid="139693597380608" timestamp=1742438601
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139693597380608" timestamp=1742438601 total_threads=16
+Mar 19 19:43:21 launchpad ollama[1115036]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41931" tid="139693597380608" timestamp=1742438601
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 19:43:21 launchpad ollama[1553]: time=2025-03-19T19:43:21.315-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 19:43:21 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 19:43:21 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 19:43:21 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 19:43:21 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 19:43:22 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 19:43:22 launchpad ollama[1115036]: INFO [main] model loaded | tid="139693597380608" timestamp=1742438602
+Mar 19 19:43:22 launchpad ollama[1553]: time=2025-03-19T19:43:22.318-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 19:43:22 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:22 | 200 |  1.437816344s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:43:30 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:43:30 | 200 |  4.306321761s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:44:33 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:44:33 | 200 |  8.974803804s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:47:09 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:47:09 | 200 |  7.410393098s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:54:48 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:48 | 200 |      16.315µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:54:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:50 | 200 |      47.754µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 19 19:54:50 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:50 | 200 |   24.445941ms |       127.0.0.1 | POST     "/api/create"
+Mar 19 19:54:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:59 | 200 |      15.258µs |       127.0.0.1 | HEAD     "/"
+Mar 19 19:54:59 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:54:59 | 200 |   12.857306ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9148235776 required="7.7 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.2 GiB" free_swap="68.9 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.952-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama352866884/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41817"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 19 19:54:59 launchpad ollama[1553]: time=2025-03-19T19:54:59.953-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] build info | build=0 commit="unknown" tid="140643379798016" timestamp=1742439299
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140643379798016" timestamp=1742439299 total_threads=16
+Mar 19 19:54:59 launchpad ollama[1117144]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41817" tid="140643379798016" timestamp=1742439299
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type  f32:   65 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type q4_0:  225 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llama_model_loader: - type q6_K:    1 tensors
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_vocab: special tokens cache size = 256
+Mar 19 19:55:00 launchpad ollama[1553]: time=2025-03-19T19:55:00.205-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: arch             = llama
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: vocab type       = BPE
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_vocab          = 128256
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_merges         = 280147
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: vocab_only       = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd           = 4096
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_layer          = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_head           = 32
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_head_kv        = 8
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_rot            = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_swa            = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_gqa            = 4
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ff             = 14336
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_expert         = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_expert_used    = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: causal attn      = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: pooling type     = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope type        = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope scaling     = linear
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: freq_scale_train = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_d_state      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model type       = 8B
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model ftype      = Q4_0
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model params     = 8.03 B
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_print_meta: max token length = 256
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 19 19:55:00 launchpad ollama[1553]: ggml_cuda_init: found 1 CUDA devices:
+Mar 19 19:55:00 launchpad ollama[1553]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_ctx      = 16384
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_batch    = 512
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: n_ubatch   = 512
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: flash_attn = 0
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: freq_scale = 1
+Mar 19 19:55:00 launchpad ollama[1553]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: graph nodes  = 1030
+Mar 19 19:55:00 launchpad ollama[1553]: llama_new_context_with_model: graph splits = 2
+Mar 19 19:55:00 launchpad ollama[1117144]: INFO [main] model loaded | tid="140643379798016" timestamp=1742439300
+Mar 19 19:55:01 launchpad ollama[1553]: time=2025-03-19T19:55:01.208-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 19 19:55:01 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:01 | 200 |  1.442368701s |       127.0.0.1 | POST     "/api/generate"
+Mar 19 19:55:08 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:08 | 200 |  3.249757371s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:55:37 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:55:37 | 200 |   12.501524ms |       127.0.0.1 | POST     "/api/show"
+Mar 19 19:56:11 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:56:11 | 200 |  9.066361293s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:57:46 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:57:46 | 200 |  3.335807135s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:58:28 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:58:28 | 200 |  3.875927335s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 19:59:31 launchpad ollama[1553]: [GIN] 2025/03/19 - 19:59:31 | 200 |  5.199894045s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:00:15 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:00:15 | 200 |  5.955558681s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:01:56 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:01:56 | 200 |  5.641119179s |       127.0.0.1 | POST     "/api/chat"
+Mar 19 20:54:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:54:26 | 200 |       19.73µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:54:26 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:54:26 | 200 |     602.314µs |       127.0.0.1 | GET      "/api/tags"
+Mar 19 20:55:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:55:47 | 200 |      16.687µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:55:47 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:55:47 | 200 |    3.535167ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 20:56:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:00 | 200 |      15.086µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:56:00 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:00 | 200 |    3.192762ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 19 20:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:44 | 200 |      14.135µs |       127.0.0.1 | HEAD     "/"
+Mar 19 20:56:44 launchpad ollama[1553]: [GIN] 2025/03/19 - 20:56:44 | 200 |      15.995µs |       127.0.0.1 | GET      "/api/ps"
+Mar 20 17:21:26 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 20 17:21:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 20 17:21:26 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 20 17:21:26 launchpad systemd[1]: ollama.service: Consumed 43min 48.241s CPU time, 12.6G memory peak, 251.3M memory swap peak, 11.5G read from disk, 759.6M written to disk.
+-- Boot bf778c65c1c7444eafed20125c5a887b --
+Mar 20 17:22:03 launchpad systemd[1]: Starting Server for local large language models...
+Mar 20 17:22:03 launchpad systemd[1]: Started Server for local large language models.
+Mar 20 17:22:03 launchpad ollama[1530]: 2025/03/20 17:22:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.293-07:00 level=INFO source=images.go:753 msg="total blobs: 26"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.305-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 1"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.307-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 20 17:22:03 launchpad ollama[1530]: time=2025-03-20T17:22:03.309-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3643862997/runners
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.346-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 20 17:22:06 launchpad ollama[1530]: time=2025-03-20T17:22:06.565-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 20 17:42:51 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:42:51 | 200 |    3.366137ms |       127.0.0.1 | GET      "/api/tags"
+Mar 20 17:43:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:43:36 | 200 |     481.038µs |       127.0.0.1 | GET      "/api/tags"
+Mar 20 17:43:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:43:36 | 200 |     514.798µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10108076032 required="7.7 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.572-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.573-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36721"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.574-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] build info | build=0 commit="unknown" tid="139779810324480" timestamp=1742517874
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139779810324480" timestamp=1742517874 total_threads=16
+Mar 20 17:44:34 launchpad ollama[13493]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36721" tid="139779810324480" timestamp=1742517874
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 20 17:44:34 launchpad ollama[1530]: time=2025-03-20T17:44:34.825-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 20 17:44:34 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 20 17:44:34 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 20 17:44:34 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 16384
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 20 17:44:40 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 20 17:44:40 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 20 17:44:40 launchpad ollama[13493]: INFO [main] model loaded | tid="139779810324480" timestamp=1742517880
+Mar 20 17:44:40 launchpad ollama[1530]: time=2025-03-20T17:44:40.842-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 20 17:44:43 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:43 | 200 |  9.257646341s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:44:44 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:44 | 200 |  583.080887ms |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:44:45 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:44:45 | 200 |  1.466299486s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:45:57 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:45:57 | 200 |  5.815849459s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:47:14 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:47:14 | 200 |  6.980502359s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:49:18 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:49:18 | 200 |  5.583089859s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:50:19 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:50:19 | 200 |  7.638621575s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:54:36 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:54:36 | 200 |  7.323435618s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 17:57:30 launchpad ollama[1530]: [GIN] 2025/03/20 - 17:57:30 | 200 |  8.047903525s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:00:19 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:00:19 | 200 |  6.289686262s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:02:11 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:02:11 | 200 |  8.495516305s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:05:02 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:05:02 | 200 |      40.555µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 18:36:03 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:36:03 | 200 |      41.118µs |       127.0.0.1 | GET      "/api/version"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10102964224 required="7.7 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.640-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46093"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.641-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.642-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] build info | build=0 commit="unknown" tid="140706518519808" timestamp=1742520997
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140706518519808" timestamp=1742520997 total_threads=16
+Mar 20 18:36:37 launchpad ollama[55558]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46093" tid="140706518519808" timestamp=1742520997
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 20 18:36:37 launchpad ollama[1530]: time=2025-03-20T18:36:37.893-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 20 18:36:37 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 20 18:36:37 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 20 18:36:37 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 16384
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 20 18:36:38 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 20 18:36:38 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 20 18:36:38 launchpad ollama[55558]: INFO [main] model loaded | tid="140706518519808" timestamp=1742520998
+Mar 20 18:36:38 launchpad ollama[1530]: time=2025-03-20T18:36:38.895-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 20 18:36:53 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:36:53 | 200 | 15.969996668s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:39:50 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:39:50 | 200 |  6.703338511s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:41:51 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:41:51 | 200 |  6.534414397s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:45:03 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:45:03 | 200 | 10.047744622s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:47:31 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:47:31 | 200 |  7.488868866s |       127.0.0.1 | POST     "/api/chat"
+Mar 20 18:52:22 launchpad ollama[1530]: [GIN] 2025/03/20 - 18:52:22 | 200 |   7.42625173s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 11:18:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 11:18:44 | 200 |      440.15µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 11:18:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 11:18:44 | 200 |      43.008µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 13:07:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 13:07:44 | 200 |     503.978µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 14:24:55 launchpad ollama[1530]: [GIN] 2025/03/21 - 14:24:55 | 200 |     453.444µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 14:36:18 launchpad ollama[1530]: [GIN] 2025/03/21 - 14:36:18 | 200 |      30.713µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:08:11 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:08:11 | 200 |      56.519µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.246-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=6543376384 required="5.1 GiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.246-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="40.5 GiB" free_swap="68.9 GiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.247-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[6.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="5.1 GiB" memory.required.partial="5.1 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[5.1 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="677.5 MiB"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36369"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.248-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] build info | build=0 commit="unknown" tid="140551136841728" timestamp=1742594944
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140551136841728" timestamp=1742594944 total_threads=16
+Mar 21 15:09:04 launchpad ollama[538108]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36369" tid="140551136841728" timestamp=1742594944
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 21 15:09:04 launchpad ollama[1530]: time=2025-03-21T15:09:04.499-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 21 15:09:04 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 21 15:09:04 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 21 15:09:04 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 2048
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 21 15:09:05 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 21 15:09:05 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 21 15:09:05 launchpad ollama[538108]: INFO [main] model loaded | tid="140551136841728" timestamp=1742594945
+Mar 21 15:09:05 launchpad ollama[1530]: time=2025-03-21T15:09:05.503-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 21 15:09:13 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:13 | 200 |  8.996825431s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:09:13 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:13 | 200 |  594.785245ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:09:16 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:09:16 | 200 |  2.606560411s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:10:27 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:10:27 | 200 |      34.064µs |       127.0.0.1 | GET      "/api/version"
+Mar 21 15:21:25 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:21:25 | 200 |      521.38µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 15:22:03 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:03 | 200 |     563.204µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=6561005568 required="5.1 GiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="40.5 GiB" free_swap="68.9 GiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.036-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[6.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="5.1 GiB" memory.required.partial="5.1 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[5.1 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="258.5 MiB" memory.graph.partial="677.5 MiB"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3643862997/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37513"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.037-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] build info | build=0 commit="unknown" tid="140372910104576" timestamp=1742595739
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140372910104576" timestamp=1742595739 total_threads=16
+Mar 21 15:22:19 launchpad ollama[540099]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37513" tid="140372910104576" timestamp=1742595739
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type  f32:   65 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type q4_0:  225 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: llama_model_loader: - type q6_K:    1 tensors
+Mar 21 15:22:19 launchpad ollama[1530]: time=2025-03-21T15:22:19.289-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_vocab: special tokens cache size = 256
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: arch             = llama
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: vocab type       = BPE
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_vocab          = 128256
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_merges         = 280147
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: vocab_only       = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd           = 4096
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_layer          = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_head           = 32
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_head_kv        = 8
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_rot            = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_swa            = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_gqa            = 4
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ff             = 14336
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_expert         = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_expert_used    = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: causal attn      = 1
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: pooling type     = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope type        = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope scaling     = linear
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: freq_scale_train = 1
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_d_state      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model type       = 8B
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model ftype      = Q4_0
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model params     = 8.03 B
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_print_meta: max token length = 256
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 21 15:22:19 launchpad ollama[1530]: ggml_cuda_init: found 1 CUDA devices:
+Mar 21 15:22:19 launchpad ollama[1530]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_ctx      = 2048
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_batch    = 512
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: n_ubatch   = 512
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: flash_attn = 0
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 21 15:22:19 launchpad ollama[1530]: llama_new_context_with_model: freq_scale = 1
+Mar 21 15:22:20 launchpad ollama[1530]: llama_kv_cache_init:      CUDA0 KV buffer size =   256.00 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: KV self size  =  256.00 MiB, K (f16):  128.00 MiB, V (f16):  128.00 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:      CUDA0 compute buffer size =   258.50 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model:  CUDA_Host compute buffer size =    12.01 MiB
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: graph nodes  = 1030
+Mar 21 15:22:20 launchpad ollama[1530]: llama_new_context_with_model: graph splits = 2
+Mar 21 15:22:20 launchpad ollama[540099]: INFO [main] model loaded | tid="140372910104576" timestamp=1742595740
+Mar 21 15:22:20 launchpad ollama[1530]: time=2025-03-21T15:22:20.292-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 21 15:22:29 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:29 | 200 | 10.384552038s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:22:29 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:29 | 200 |   673.66211ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:22:32 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:22:32 | 200 |  2.666591652s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:23 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:23 | 200 |  9.822015206s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:24 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:24 | 200 |  676.470508ms |       127.0.0.1 | POST     "/api/chat"
+Mar 21 15:24:27 launchpad ollama[1530]: [GIN] 2025/03/21 - 15:24:27 | 200 |   3.13406576s |       127.0.0.1 | POST     "/api/chat"
+Mar 21 17:02:44 launchpad ollama[1530]: [GIN] 2025/03/21 - 17:02:44 | 200 |     555.604µs |       127.0.0.1 | GET      "/api/tags"
+Mar 21 17:02:45 launchpad ollama[1530]: [GIN] 2025/03/21 - 17:02:45 | 200 |      43.831µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 11:48:56 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 22 11:48:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 22 11:48:56 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 22 11:48:56 launchpad systemd[1]: ollama.service: Consumed 2min 32.909s CPU time, 5.4G memory peak, 48.9M memory swap peak, 4.6G read from disk, 557M written to disk, 487.4K incoming IP traffic, 677.3K outgoing IP traffic.
+-- Boot 87d76a808f644c6e8843c8f6c36724e1 --
+Mar 22 11:49:48 launchpad systemd[1]: Starting Server for local large language models...
+Mar 22 11:49:48 launchpad systemd[1]: Started Server for local large language models.
+Mar 22 11:49:48 launchpad ollama[1530]: 2025/03/22 11:49:48 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.898-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.907-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.910-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 22 11:49:48 launchpad ollama[1530]: time=2025-03-22T11:49:48.911-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2384678814/runners
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.928-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.929-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.929-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.930-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:51 launchpad ollama[1530]: time=2025-03-22T11:49:51.930-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 11:49:52 launchpad ollama[1530]: time=2025-03-22T11:49:52.164-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 22 12:05:09 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 22 12:05:09 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 22 12:05:09 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 22 12:05:09 launchpad systemd[1]: ollama.service: Consumed 3.420s CPU time, 786.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot d80e719555084707a7104971b1b682af --
+Mar 22 12:05:40 launchpad systemd[1]: Starting Server for local large language models...
+Mar 22 12:05:40 launchpad systemd[1]: Started Server for local large language models.
+Mar 22 12:05:40 launchpad ollama[1526]: 2025/03/22 12:05:40 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.978-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.985-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.987-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 22 12:05:40 launchpad ollama[1526]: time=2025-03-22T12:05:40.988-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1402874388/runners
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.011-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.012-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 22 12:05:44 launchpad ollama[1526]: time=2025-03-22T12:05:44.252-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 22 18:32:07 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:07 | 200 |    3.125817ms |       127.0.0.1 | GET      "/api/tags"
+Mar 22 18:32:08 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:08 | 200 |     479.492µs |       127.0.0.1 | GET      "/api/tags"
+Mar 22 18:32:08 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:08 | 200 |     361.952µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 18:32:17 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:32:17 | 200 |       24.56µs |       127.0.0.1 | GET      "/api/version"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.995-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.2 GiB" free_swap="68.9 GiB"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.995-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 32777"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 22 18:36:05 launchpad ollama[1526]: time=2025-03-22T18:36:05.997-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] build info | build=0 commit="unknown" tid="140560787623936" timestamp=1742693766
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140560787623936" timestamp=1742693766 total_threads=16
+Mar 22 18:36:06 launchpad ollama[63432]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32777" tid="140560787623936" timestamp=1742693766
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 22 18:36:06 launchpad ollama[1526]: time=2025-03-22T18:36:06.248-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 22 18:36:06 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 22 18:36:06 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 22 18:36:06 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 22 18:36:11 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 22 18:36:11 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 22 18:36:12 launchpad ollama[63432]: INFO [main] model loaded | tid="140560787623936" timestamp=1742693772
+Mar 22 18:36:12 launchpad ollama[1526]: time=2025-03-22T18:36:12.266-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Mar 22 18:36:31 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:31 | 200 | 25.756314363s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:36:32 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:32 | 200 |  1.130742083s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:36:55 launchpad ollama[1526]: [GIN] 2025/03/22 - 18:36:55 | 200 | 22.246422296s |       127.0.0.1 | POST     "/api/chat"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.111-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="50.1 GiB" free_swap="68.9 GiB"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.111-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 42175"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.112-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] build info | build=0 commit="unknown" tid="139826444582912" timestamp=1742694179
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139826444582912" timestamp=1742694179 total_threads=16
+Mar 22 18:42:59 launchpad ollama[81038]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42175" tid="139826444582912" timestamp=1742694179
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 22 18:42:59 launchpad ollama[1526]: time=2025-03-22T18:42:59.363-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 22 18:42:59 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 22 18:42:59 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 22 18:42:59 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 22 18:43:00 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 22 18:43:00 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 22 18:43:00 launchpad ollama[81038]: INFO [main] model loaded | tid="139826444582912" timestamp=1742694180
+Mar 22 18:43:00 launchpad ollama[1526]: time=2025-03-22T18:43:00.124-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 22 19:01:30 launchpad ollama[1526]: [GIN] 2025/03/22 - 19:01:30 | 200 |        18m32s |       127.0.0.1 | POST     "/api/chat"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.627-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="49.8 GiB" free_swap="68.9 GiB"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.628-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=31 layers.split="" memory.available="[7.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.8 GiB" memory.required.partial="7.2 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.2 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1402874388/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 31 --parallel 1 --port 45027"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.629-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] build info | build=0 commit="unknown" tid="139888281346048" timestamp=1742754368
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139888281346048" timestamp=1742754368 total_threads=16
+Mar 23 11:26:08 launchpad ollama[354272]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45027" tid="139888281346048" timestamp=1742754368
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type  f32:   65 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type q4_0:  225 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llama_model_loader: - type q6_K:    1 tensors
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_vocab: special tokens cache size = 256
+Mar 23 11:26:08 launchpad ollama[1526]: time=2025-03-23T11:26:08.880-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: arch             = llama
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: vocab type       = BPE
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_vocab          = 128256
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_merges         = 280147
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: vocab_only       = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd           = 4096
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_layer          = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_head           = 32
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_head_kv        = 8
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_rot            = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_swa            = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_gqa            = 4
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ff             = 14336
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_expert         = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_expert_used    = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: causal attn      = 1
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: pooling type     = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope type        = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope scaling     = linear
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: freq_scale_train = 1
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_d_state      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model type       = 8B
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model ftype      = Q4_0
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model params     = 8.03 B
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 23 11:26:08 launchpad ollama[1526]: llm_load_print_meta: max token length = 256
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 23 11:26:08 launchpad ollama[1526]: ggml_cuda_init: found 1 CUDA devices:
+Mar 23 11:26:08 launchpad ollama[1526]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: offloading 31 repeating layers to GPU
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors: offloaded 31/33 layers to GPU
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors:        CPU buffer size =  4437.80 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llm_load_tensors:      CUDA0 buffer size =  3627.97 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_ctx      = 16384
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_batch    = 512
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: n_ubatch   = 512
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: flash_attn = 0
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: freq_scale = 1
+Mar 23 11:26:09 launchpad ollama[1526]: llama_kv_cache_init:  CUDA_Host KV buffer size =    64.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_kv_cache_init:      CUDA0 KV buffer size =  1984.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:      CUDA0 compute buffer size =  1145.00 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: graph nodes  = 1030
+Mar 23 11:26:09 launchpad ollama[1526]: llama_new_context_with_model: graph splits = 15
+Mar 23 11:26:09 launchpad ollama[354272]: INFO [main] model loaded | tid="139888281346048" timestamp=1742754369
+Mar 23 11:26:09 launchpad ollama[1526]: time=2025-03-23T11:26:09.633-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.00 seconds"
+Mar 23 11:26:12 launchpad ollama[1526]: [GIN] 2025/03/23 - 11:26:12 | 200 |  4.108947061s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 11:47:25 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 29 11:47:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 29 11:47:25 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 29 11:47:25 launchpad systemd[1]: ollama.service: Consumed 1h 24min 53.790s CPU time, 5.5G memory peak, 4.5G read from disk, 508.1M written to disk, 4.8M incoming IP traffic, 7.2M outgoing IP traffic.
+-- Boot bf78be1ece36430ea50ae36db80fd028 --
+Mar 29 11:47:58 launchpad systemd[1]: Starting Server for local large language models...
+Mar 29 11:47:58 launchpad systemd[1]: Started Server for local large language models.
+Mar 29 11:47:58 launchpad ollama[1512]: 2025/03/29 11:47:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.329-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.339-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.341-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 29 11:47:58 launchpad ollama[1512]: time=2025-03-29T11:47:58.344-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2352317378/runners
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.346-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.347-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:48:01 launchpad ollama[1512]: time=2025-03-29T11:48:01.624-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.3 GiB"
+Mar 29 11:49:26 launchpad systemd[1]: Stopping Server for local large language models...
+Mar 29 11:49:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Mar 29 11:49:26 launchpad systemd[1]: Stopped Server for local large language models.
+Mar 29 11:49:26 launchpad systemd[1]: ollama.service: Consumed 3.432s CPU time, 787M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot cf2f988c42ce41d9aeb1a1f4e09a64af --
+Mar 29 11:49:57 launchpad systemd[1]: Starting Server for local large language models...
+Mar 29 11:49:57 launchpad systemd[1]: Started Server for local large language models.
+Mar 29 11:49:58 launchpad ollama[1510]: 2025/03/29 11:49:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.011-07:00 level=INFO source=images.go:753 msg="total blobs: 25"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.020-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.022-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Mar 29 11:49:58 launchpad ollama[1510]: time=2025-03-29T11:49:58.024-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1630742596/runners
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.071-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.072-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.072-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Mar 29 11:50:01 launchpad ollama[1510]: time=2025-03-29T11:50:01.301-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Mar 29 12:06:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:06:57 | 200 |     909.428µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:03 | 200 |      46.264µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:07:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:03 | 200 |   26.073261ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:07:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:11 | 200 |      15.595µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:11 | 200 |    3.796966ms |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:07:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:31 | 200 |      25.537µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:31 | 200 |    2.925078ms |       127.0.0.1 | DELETE   "/api/delete"
+Mar 29 12:07:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:40 | 200 |      21.429µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:42 | 200 |      56.883µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:07:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:42 | 200 |   26.024146ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:07:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:45 | 200 |      15.269µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:07:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:07:45 | 200 |     496.543µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:45 | 200 |     423.031µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:46 | 200 |     457.267µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:11:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:11:46 | 200 |      31.581µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:12:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:06 | 200 |      48.021µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.934-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10215030784 required="7.7 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.935-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.935-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40787"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.936-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:12:18 launchpad ollama[1510]: time=2025-03-29T12:12:18.937-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] build info | build=0 commit="unknown" tid="139713714561024" timestamp=1743275538
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139713714561024" timestamp=1743275538 total_threads=16
+Mar 29 12:12:18 launchpad ollama[8895]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40787" tid="139713714561024" timestamp=1743275538
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:12:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:12:19 launchpad ollama[1510]: time=2025-03-29T12:12:19.187-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:12:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:12:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:12:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:12:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:12:20 launchpad ollama[8895]: INFO [main] model loaded | tid="139713714561024" timestamp=1743275540
+Mar 29 12:12:20 launchpad ollama[1510]: time=2025-03-29T12:12:20.191-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 12:12:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:23 | 200 |  4.897748354s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:12:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:24 | 200 |   637.32604ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:12:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:12:27 | 200 |   3.19952118s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:13:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:13:14 | 200 |  4.952539512s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:15:18 | 200 |  6.665898331s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:17:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:17:09 | 200 |  5.759665031s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:18:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:18:52 | 200 |  7.221153274s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:20:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:20:30 | 200 |  6.551018819s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:23:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:23:24 | 200 |  4.737337823s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:25:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:25:07 | 200 |  3.491567625s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:26:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:26:15 | 200 |  7.677028492s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:28:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:28:38 | 200 | 11.999277099s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:33:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:33:45 | 200 | 10.595503222s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:35:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:35:06 | 200 | 11.251535252s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10224271360 required="7.7 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.4 GiB" free_swap="68.9 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.813-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46297"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:42:13 launchpad ollama[1510]: time=2025-03-29T12:42:13.814-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] build info | build=0 commit="unknown" tid="140134671540224" timestamp=1743277333
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140134671540224" timestamp=1743277333 total_threads=16
+Mar 29 12:42:13 launchpad ollama[13652]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46297" tid="140134671540224" timestamp=1743277333
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:42:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:42:14 launchpad ollama[1510]: time=2025-03-29T12:42:14.066-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:42:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:42:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:42:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:42:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:42:14 launchpad ollama[13652]: INFO [main] model loaded | tid="140134671540224" timestamp=1743277334
+Mar 29 12:42:15 launchpad ollama[1510]: time=2025-03-29T12:42:15.069-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 12:42:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:42:26 | 200 | 12.993540226s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:43:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:43:11 | 200 |  8.164448502s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:43:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:43:58 | 200 |      15.023µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:01 | 200 |      45.372µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:44:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:01 | 200 |   24.951458ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:44:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:14 | 200 |      16.158µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:14 | 200 |      67.418µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:44:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:36 | 200 |      17.009µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:44:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:44:36 | 200 |      431.12µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 12:45:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:12 | 200 |       15.25µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:12 | 200 |     350.332µs |       127.0.0.1 | POST     "/api/generate"
+Mar 29 12:45:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:16 | 200 |      15.152µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:16 | 200 |       7.923µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.092-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10264117248 required="7.7 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.092-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.3 GiB" free_swap="68.9 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.093-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.093-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 43959"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.094-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] build info | build=0 commit="unknown" tid="140403551617024" timestamp=1743277527
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140403551617024" timestamp=1743277527 total_threads=16
+Mar 29 12:45:27 launchpad ollama[14407]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43959" tid="140403551617024" timestamp=1743277527
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 12:45:27 launchpad ollama[1510]: time=2025-03-29T12:45:27.345-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 12:45:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 12:45:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 12:45:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 12:45:28 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 12:45:28 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 12:45:28 launchpad ollama[14407]: INFO [main] model loaded | tid="140403551617024" timestamp=1743277528
+Mar 29 12:45:28 launchpad ollama[1510]: time=2025-03-29T12:45:28.349-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 12:45:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:31 | 200 |      17.212µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:45:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:31 | 200 |      18.745µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 12:45:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:45:40 | 200 | 13.878557309s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:46:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:46:17 | 200 |  4.925003725s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:47:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:47:30 | 200 |      19.517µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:47:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:47:30 | 200 |   12.998916ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 12:48:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:48:23 | 200 |      25.343µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:48:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:48:27 | 200 |      25.454µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 12:49:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:49:15 | 200 |  5.394766361s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:49:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:49:18 | 200 |  7.810179363s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:50:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:50:33 | 200 |  7.107549148s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:51:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:41 | 200 |      14.957µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:51:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:43 | 200 |      49.621µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 12:51:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:43 | 200 |   25.094074ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 12:51:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:46 | 200 |      15.517µs |       127.0.0.1 | HEAD     "/"
+Mar 29 12:51:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:51:46 | 200 |   13.371083ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 12:52:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:52:14 | 200 | 11.649019777s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:53:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:53:18 | 200 |  5.781659817s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:54:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:54:26 | 200 |  5.181279914s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:55:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:55:11 | 200 |  5.992279574s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:56:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:56:43 | 200 |  9.127719573s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:57:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:57:52 | 200 | 12.860902242s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 12:59:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 12:59:03 | 200 |  8.892812668s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:00:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:00:23 | 200 | 10.323785591s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10266214400 required="7.7 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.990-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 40401"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:05:39 launchpad ollama[1510]: time=2025-03-29T13:05:39.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] build info | build=0 commit="unknown" tid="140078037950464" timestamp=1743278740
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140078037950464" timestamp=1743278740 total_threads=16
+Mar 29 13:05:40 launchpad ollama[17589]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40401" tid="140078037950464" timestamp=1743278740
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:05:40 launchpad ollama[1510]: time=2025-03-29T13:05:40.242-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:05:40 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:05:40 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:05:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:05:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:05:41 launchpad ollama[17589]: INFO [main] model loaded | tid="140078037950464" timestamp=1743278741
+Mar 29 13:05:41 launchpad ollama[1510]: time=2025-03-29T13:05:41.247-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:05:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:05:59 | 200 | 19.995767094s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:07:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:07:26 | 200 |  8.150352463s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:08:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:08:03 | 200 | 10.703657875s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:09:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:09:36 | 200 |  9.706439772s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:10:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:10:55 | 200 |  4.229398164s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:11:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:11:50 | 200 |  3.435489866s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:03 | 200 |  3.636389468s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:22 | 200 |  4.237886728s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:34 | 200 |  3.627417048s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:12:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:12:49 | 200 |  3.926222454s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:13:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:13:03 | 200 |  4.159111102s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:14:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:20 | 200 |  3.896558089s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:14:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:44 | 200 |      15.207µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:14:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:14:44 | 200 |     265.723µs |       127.0.0.1 | POST     "/api/generate"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.252-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10247340032 required="7.7 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.252-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.253-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 36405"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.254-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] build info | build=0 commit="unknown" tid="140561416552448" timestamp=1743279288
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140561416552448" timestamp=1743279288 total_threads=16
+Mar 29 13:14:48 launchpad ollama[18950]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36405" tid="140561416552448" timestamp=1743279288
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:14:48 launchpad ollama[1510]: time=2025-03-29T13:14:48.505-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:14:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:14:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:14:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:14:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:14:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:14:49 launchpad ollama[18950]: INFO [main] model loaded | tid="140561416552448" timestamp=1743279289
+Mar 29 13:14:49 launchpad ollama[1510]: time=2025-03-29T13:14:49.508-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 13:15:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:03 | 200 | 15.600965044s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:14 | 200 |  3.858068645s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:20 | 200 |  1.589176928s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:15:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:15:30 | 200 | 14.443002461s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:16:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:16:17 | 200 |  4.031074324s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:14 | 200 |  9.958618766s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:14 | 200 |  803.635398ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:18:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:18:16 | 200 |    1.2515995s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:23:16 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:23:16.207-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9938206720 required="7.7 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="46.5 GiB" free_swap="68.9 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.613-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.614-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39459"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.614-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.615-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.615-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] build info | build=0 commit="unknown" tid="140245866528768" timestamp=1743280029
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140245866528768" timestamp=1743280029 total_threads=16
+Mar 29 13:27:09 launchpad ollama[22700]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39459" tid="140245866528768" timestamp=1743280029
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:27:09 launchpad ollama[1510]: time=2025-03-29T13:27:09.866-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:27:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:27:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:27:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:27:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:27:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:27:10 launchpad ollama[22700]: INFO [main] model loaded | tid="140245866528768" timestamp=1743280030
+Mar 29 13:27:10 launchpad ollama[1510]: time=2025-03-29T13:27:10.869-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 13:27:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:27:16 | 200 |  7.512050236s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:32:17 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:32:17.056-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.761-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9974120448 required="7.7 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.761-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.0 GiB" free_swap="68.9 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.762-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 39893"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:34:09 launchpad ollama[1510]: time=2025-03-29T13:34:09.763-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] build info | build=0 commit="unknown" tid="140205579079680" timestamp=1743280449
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140205579079680" timestamp=1743280449 total_threads=16
+Mar 29 13:34:09 launchpad ollama[24583]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39893" tid="140205579079680" timestamp=1743280449
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:34:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:34:10 launchpad ollama[1510]: time=2025-03-29T13:34:10.015-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:34:10 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:34:10 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:34:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:34:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:34:10 launchpad ollama[24583]: INFO [main] model loaded | tid="140205579079680" timestamp=1743280450
+Mar 29 13:34:11 launchpad ollama[1510]: time=2025-03-29T13:34:11.018-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:34:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:34:17 | 200 |  7.682613172s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:36:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:36:08 | 200 |      15.045µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:36:11 launchpad ollama[1510]: time=2025-03-29T13:36:11.365-07:00 level=INFO source=download.go:175 msg="downloading 6340dc3229b0 in 16 307 MB part(s)"
+Mar 29 13:38:11 launchpad ollama[1510]: time=2025-03-29T13:38:11.275-07:00 level=INFO source=download.go:175 msg="downloading 369ca498f347 in 1 387 B part(s)"
+Mar 29 13:38:12 launchpad ollama[1510]: time=2025-03-29T13:38:12.797-07:00 level=INFO source=download.go:175 msg="downloading 6e4c38e1172f in 1 1.1 KB part(s)"
+Mar 29 13:38:14 launchpad ollama[1510]: time=2025-03-29T13:38:14.177-07:00 level=INFO source=download.go:175 msg="downloading f4d24e9138dd in 1 148 B part(s)"
+Mar 29 13:38:15 launchpad ollama[1510]: time=2025-03-29T13:38:15.486-07:00 level=INFO source=download.go:175 msg="downloading 0cb05c6e4e02 in 1 487 B part(s)"
+Mar 29 13:38:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:38:19 | 200 |         2m10s |       127.0.0.1 | POST     "/api/pull"
+Mar 29 13:39:17 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-29T13:39:17.386-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9946005504 required="7.7 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.1 GiB" free_swap="68.9 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.035-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.036-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 34835"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.036-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.037-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.037-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] build info | build=0 commit="unknown" tid="139740411109376" timestamp=1743281723
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139740411109376" timestamp=1743281723 total_threads=16
+Mar 29 13:55:23 launchpad ollama[30258]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34835" tid="139740411109376" timestamp=1743281723
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 13:55:23 launchpad ollama[1510]: time=2025-03-29T13:55:23.288-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 13:55:23 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 13:55:23 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 13:55:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 13:55:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 13:55:24 launchpad ollama[30258]: INFO [main] model loaded | tid="139740411109376" timestamp=1743281724
+Mar 29 13:55:24 launchpad ollama[1510]: time=2025-03-29T13:55:24.292-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 13:55:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:55:31 | 200 |   8.59984653s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 13:57:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:57:17 | 200 |      16.399µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:57:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:57:17 | 200 |     525.548µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 13:58:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:58:07 | 200 |      15.609µs |       127.0.0.1 | HEAD     "/"
+Mar 29 13:58:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 13:58:07 | 200 |   12.559389ms |       127.0.0.1 | POST     "/api/show"
+Mar 29 14:12:56 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:56 | 200 |      14.583µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:12:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:59 | 200 |      44.611µs |       127.0.0.1 | POST     "/api/blobs/sha256:6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be"
+Mar 29 14:12:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:12:59 | 200 |   69.446199ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:01 | 200 |      15.583µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:03 | 200 |      51.094µs |       127.0.0.1 | POST     "/api/blobs/sha256:6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa"
+Mar 29 14:14:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:03 | 200 |   24.847556ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:20 | 200 |      15.174µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:23 | 200 |      43.754µs |       127.0.0.1 | POST     "/api/blobs/sha256:6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be"
+Mar 29 14:14:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:23 | 200 |   69.143041ms |       127.0.0.1 | POST     "/api/create"
+Mar 29 14:14:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:48 | 200 |      15.822µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:14:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:14:48 | 200 |     618.311µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:15:20 | 200 |      26.491µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:16:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:16:20 | 200 |     629.155µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:16:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:16:20 | 200 |      38.872µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9934471168 required="6.5 GiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="45.0 GiB" free_swap="68.9 GiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.335-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39699"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.337-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] build info | build=0 commit="unknown" tid="140627769352192" timestamp=1743283089
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140627769352192" timestamp=1743283089 total_threads=16
+Mar 29 14:18:09 launchpad ollama[36827]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39699" tid="140627769352192" timestamp=1743283089
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:18:09 launchpad ollama[1510]: time=2025-03-29T14:18:09.587-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:18:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:18:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:18:09 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:18:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:18:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:18:10 launchpad ollama[36827]: INFO [main] model loaded | tid="140627769352192" timestamp=1743283090
+Mar 29 14:18:10 launchpad ollama[1510]: time=2025-03-29T14:18:10.591-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:18:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:30 | 200 | 21.477310996s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:18:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:32 | 200 |  1.388959098s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:18:33 | 200 |  1.547988848s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:23:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:23:54 | 200 |      33.383µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9930014720 required="6.5 GiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="44.8 GiB" free_swap="68.9 GiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.795-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34413"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:24:00 launchpad ollama[1510]: time=2025-03-29T14:24:00.797-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] build info | build=0 commit="unknown" tid="139833322168320" timestamp=1743283440
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139833322168320" timestamp=1743283440 total_threads=16
+Mar 29 14:24:00 launchpad ollama[38425]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34413" tid="139833322168320" timestamp=1743283440
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:24:00 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:24:01 launchpad ollama[1510]: time=2025-03-29T14:24:01.048-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:24:01 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:24:01 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:24:01 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:24:01 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:24:01 launchpad ollama[38425]: INFO [main] model loaded | tid="139833322168320" timestamp=1743283441
+Mar 29 14:24:02 launchpad ollama[1510]: time=2025-03-29T14:24:02.052-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:24:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:24:21 | 200 | 21.095573632s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:46:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:46:25 | 200 |      16.151µs |       127.0.0.1 | HEAD     "/"
+Mar 29 14:46:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:46:25 | 200 |      11.351µs |       127.0.0.1 | GET      "/api/ps"
+Mar 29 14:48:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:48:21 | 200 |     529.568µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:50:37 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:50:37 | 200 |     628.376µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 14:50:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:50:47 | 200 |      28.219µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:51:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:51:17 | 200 |      25.524µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:51:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:51:36 | 200 |       25.05µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8687517696 required="6.5 GiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="43.8 GiB" free_swap="68.9 GiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.112-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.113-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43335"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.114-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] build info | build=0 commit="unknown" tid="139661138759680" timestamp=1743285124
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139661138759680" timestamp=1743285124 total_threads=16
+Mar 29 14:52:04 launchpad ollama[46555]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43335" tid="139661138759680" timestamp=1743285124
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 14:52:04 launchpad ollama[1510]: time=2025-03-29T14:52:04.365-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 14:52:04 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 14:52:04 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 14:52:04 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 14:52:05 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 14:52:05 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 14:52:05 launchpad ollama[46555]: INFO [main] model loaded | tid="139661138759680" timestamp=1743285125
+Mar 29 14:52:05 launchpad ollama[1510]: time=2025-03-29T14:52:05.367-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 14:52:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:16 | 200 | 12.229921912s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:52:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:16 | 200 |  675.074849ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:52:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:52:17 | 200 |  999.092403ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 14:57:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 14:57:11 | 200 |     563.021µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:03:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:03:23 | 200 |     548.489µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:03:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:03:23 | 200 |      29.257µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |      26.645µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |     541.141µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:04:58 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:04:58 | 200 |     969.124µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:05:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:05:11 | 200 |      25.431µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:06:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:06:46 | 200 |     615.367µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:06:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:06:46 | 200 |      29.462µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:08:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:08:23 | 200 |      30.071µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:08:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:08:44 | 200 |      26.805µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8990490624 required="6.5 GiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.2 GiB" free_swap="68.9 GiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.780-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.781-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33433"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:09:16 launchpad ollama[1510]: time=2025-03-29T15:09:16.782-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] build info | build=0 commit="unknown" tid="140565074239488" timestamp=1743286156
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140565074239488" timestamp=1743286156 total_threads=16
+Mar 29 15:09:16 launchpad ollama[51850]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33433" tid="140565074239488" timestamp=1743286156
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:09:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:09:17 launchpad ollama[1510]: time=2025-03-29T15:09:17.032-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:09:17 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:09:17 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:09:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:09:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:09:17 launchpad ollama[51850]: INFO [main] model loaded | tid="140565074239488" timestamp=1743286157
+Mar 29 15:09:18 launchpad ollama[1510]: time=2025-03-29T15:09:18.036-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:09:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:18 | 200 |  2.202793672s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:39 | 200 | 12.393429208s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:40 | 200 |  767.179528ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:09:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:09:41 | 200 |  1.098916675s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:10:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:10:38 | 200 |     575.915µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:10:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:10:39 | 200 |      26.345µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:12:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:12:27 | 200 |     578.327µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:12:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:12:31 | 200 |      27.439µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:14:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:14:02 | 200 |     522.816µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:14:56 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:14:56 | 200 |      27.209µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8989310976 required="6.5 GiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="36.9 GiB" free_swap="68.9 GiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.557-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32929"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.558-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.559-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] build info | build=0 commit="unknown" tid="139621333716992" timestamp=1743286509
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139621333716992" timestamp=1743286509 total_threads=16
+Mar 29 15:15:09 launchpad ollama[53962]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32929" tid="139621333716992" timestamp=1743286509
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:15:09 launchpad ollama[1510]: time=2025-03-29T15:15:09.809-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:15:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:15:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:15:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:15:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:15:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:15:10 launchpad ollama[53962]: INFO [main] model loaded | tid="139621333716992" timestamp=1743286510
+Mar 29 15:15:10 launchpad ollama[1510]: time=2025-03-29T15:15:10.812-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:18 | 200 |  8.684809515s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:15:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:18 | 200 |  630.645845ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:15:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:15:20 | 200 |  1.754691555s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:16:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:16:40 | 200 |  9.252340304s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:17:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:17:34 | 200 |  8.512406778s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:21 | 200 |     550.079µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:18:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:24 | 200 |      25.383µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:33 | 200 |  4.758009566s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:33 | 200 |  380.581024ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:18:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:18:34 | 200 |  717.507959ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:19:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:19:31 | 200 |  7.114769033s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:20:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:20:09 | 200 |  1.248733669s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:20:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:20:22 | 200 | 10.979411906s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:21:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:21:10 | 200 |     614.951µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:32:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:32:47 | 200 |     527.521µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:33:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:10 | 200 |     536.619µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:33:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:13 | 200 |      27.247µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9001500672 required="6.5 GiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.6 GiB" free_swap="68.9 GiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.472-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38149"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.473-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] build info | build=0 commit="unknown" tid="139883615182848" timestamp=1743287613
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139883615182848" timestamp=1743287613 total_threads=16
+Mar 29 15:33:33 launchpad ollama[58902]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38149" tid="139883615182848" timestamp=1743287613
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:33:33 launchpad ollama[1510]: time=2025-03-29T15:33:33.725-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:33:33 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:33:33 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:33:33 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:33:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:33:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:33:34 launchpad ollama[58902]: INFO [main] model loaded | tid="139883615182848" timestamp=1743287614
+Mar 29 15:33:34 launchpad ollama[1510]: time=2025-03-29T15:33:34.729-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 15:33:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:43 | 200 |  9.933743141s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:33:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:43 | 200 |  672.981374ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:33:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:33:44 | 200 |  883.962566ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:34:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:34:41 | 200 |  7.224379028s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:36:42 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:36:42 | 200 |  1.400049248s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:36:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:36:55 | 200 |  6.371107151s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:15 | 200 |   1.62644083s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:18 | 200 |  2.701827007s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:34 | 200 |  1.607649512s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:38 | 200 |  4.575297353s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:54 | 200 |  1.391617428s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:37:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:37:59 | 200 |  5.173918447s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:38:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:38:43 | 200 |  1.299442014s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:38:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:38:57 | 200 |  6.476457729s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:40:32 | 200 |  1.263283008s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:40:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:40:47 | 200 | 14.443386837s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:08 | 200 |      25.749µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:42:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:25 | 200 | 13.333347259s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:26 | 200 |  870.899897ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:42:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:42:27 | 200 |  1.233841321s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:45:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:45:40 | 200 |      26.033µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:46:00 launchpad ollama[1510]: time=2025-03-29T15:46:00.821-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9010544640 required="7.7 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.0 GiB" free_swap="68.9 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.522-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 37267"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.523-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.524-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] build info | build=0 commit="unknown" tid="139889629814784" timestamp=1743288361
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139889629814784" timestamp=1743288361 total_threads=16
+Mar 29 15:46:01 launchpad ollama[62138]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37267" tid="139889629814784" timestamp=1743288361
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 15:46:01 launchpad ollama[1510]: time=2025-03-29T15:46:01.775-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:46:01 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:46:01 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:46:01 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:46:02 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:46:02 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:46:02 launchpad ollama[62138]: INFO [main] model loaded | tid="139889629814784" timestamp=1743288362
+Mar 29 15:46:02 launchpad ollama[1510]: time=2025-03-29T15:46:02.778-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:46:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:04 | 200 |  3.424421819s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:13 | 200 |  4.879807842s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:49 | 200 |  1.052225209s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:46:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:46:54 | 200 |  5.132097154s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:50:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:28 | 200 |        26.3µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:50:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:38 | 200 |    897.9825ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:50:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:50:51 | 200 |  6.948801191s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:52:21 launchpad ollama[1510]: time=2025-03-29T15:52:21.545-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="1.1 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9037217792 required="6.5 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.4 GiB" free_swap="68.9 GiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.273-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37103"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.274-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.275-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] build info | build=0 commit="unknown" tid="139973150744576" timestamp=1743288742
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139973150744576" timestamp=1743288742 total_threads=16
+Mar 29 15:52:22 launchpad ollama[63870]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37103" tid="139973150744576" timestamp=1743288742
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 15:52:22 launchpad ollama[1510]: time=2025-03-29T15:52:22.526-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:52:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:52:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:52:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:52:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:52:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:52:23 launchpad ollama[63870]: INFO [main] model loaded | tid="139973150744576" timestamp=1743288743
+Mar 29 15:52:23 launchpad ollama[1510]: time=2025-03-29T15:52:23.528-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:52:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:52:35 | 200 | 13.962988563s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:53:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:53:21 | 200 | 10.415352509s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:54:17 | 200 |  6.296363593s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:54:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:54:44 | 200 |  4.399340247s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:03 | 200 |  6.300933513s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:31 | 200 |     582.645µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:55:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:36 | 200 |      26.017µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:55:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:54 | 200 |  5.422678956s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:55:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:55:59 | 200 |      25.119µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:56:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:09 | 200 |  5.536962529s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:10 | 200 |  834.192484ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:11 | 200 |  683.826882ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:56:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:56:57 | 200 |     533.139µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 15:57:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:00 | 200 |      26.765µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 15:57:04 launchpad ollama[1510]: time=2025-03-29T15:57:04.394-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8998748160 required="6.2 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="37.5 GiB" free_swap="68.9 GiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.107-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.108-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39715"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.109-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] build info | build=0 commit="unknown" tid="140646590676992" timestamp=1743289025
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140646590676992" timestamp=1743289025 total_threads=16
+Mar 29 15:57:05 launchpad ollama[65101]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39715" tid="140646590676992" timestamp=1743289025
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 15:57:05 launchpad ollama[1510]: time=2025-03-29T15:57:05.359-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 15:57:05 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 15:57:05 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 15:57:05 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 15:57:06 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 15:57:06 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 15:57:06 launchpad ollama[65101]: INFO [main] model loaded | tid="140646590676992" timestamp=1743289026
+Mar 29 15:57:06 launchpad ollama[1510]: time=2025-03-29T15:57:06.363-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 15:57:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:09 | 200 |  5.238370692s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:57:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:09 | 200 |  403.352434ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:57:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:57:11 | 200 |   1.72354465s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:58:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:58:01 | 200 |  2.934241667s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 15:58:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 15:58:13 | 200 |     559.124µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:00:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:00:52 | 200 |      26.432µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:01:22 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:01:22 | 200 |      36.682µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:02 launchpad ollama[1510]: time=2025-03-29T16:02:02.865-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 16:02:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:04 | 200 |  2.127141193s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:19 | 200 |      25.814µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:28 | 200 |      25.063µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 16:02:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:47 | 200 |  1.347830878s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:47 | 200 |  369.390909ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:02:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:02:49 | 200 |  1.428751454s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:03:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:03:08 | 200 |  2.267334677s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:03:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:03:38 | 200 |  1.247778535s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:04:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:04:47 | 200 |  3.313803625s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:05:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:05:31 | 200 |  4.136510182s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:06:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:06:09 | 200 |  4.212121131s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:07:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:07:54 | 200 |  5.164935817s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:23:07 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:23:07 | 200 |     628.736µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:38:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:38:28 | 200 |     531.066µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.368-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8988655616 required="6.2 GiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.368-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="35.4 GiB" free_swap="68.9 GiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.369-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.370-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38905"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.372-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] build info | build=0 commit="unknown" tid="140000488312832" timestamp=1743291743
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140000488312832" timestamp=1743291743 total_threads=16
+Mar 29 16:42:23 launchpad ollama[91493]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38905" tid="140000488312832" timestamp=1743291743
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 16:42:23 launchpad ollama[1510]: time=2025-03-29T16:42:23.623-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 16:42:23 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 16:42:23 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 16:42:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 16:42:24 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 16:42:24 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 16:42:24 launchpad ollama[91493]: INFO [main] model loaded | tid="140000488312832" timestamp=1743291744
+Mar 29 16:42:24 launchpad ollama[1510]: time=2025-03-29T16:42:24.627-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 16:42:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:42:30 | 200 |  7.653093301s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8867414016 required="6.2 GiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="36.4 GiB" free_swap="68.9 GiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.233-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38435"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.235-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] build info | build=0 commit="unknown" tid="139900342849536" timestamp=1743292293
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139900342849536" timestamp=1743292293 total_threads=16
+Mar 29 16:51:33 launchpad ollama[94118]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38435" tid="139900342849536" timestamp=1743292293
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 16:51:33 launchpad ollama[1510]: time=2025-03-29T16:51:33.486-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 16:51:33 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 16:51:33 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 16:51:33 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 16:51:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 16:51:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 16:51:34 launchpad ollama[94118]: INFO [main] model loaded | tid="139900342849536" timestamp=1743292294
+Mar 29 16:51:34 launchpad ollama[1510]: time=2025-03-29T16:51:34.490-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 16:51:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:51:39 | 200 |  6.709006721s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 16:54:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:54:09 | 200 |     514.283µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 16:56:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 16:56:24 | 200 |     537.924µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:00:54 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:00:54 | 200 |     581.225µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:06:45 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:06:45 | 200 |     528.457µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:12:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:12:03 | 200 |     511.642µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:14:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:14:33 | 200 |      36.602µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:16:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:16:29 | 200 |      26.409µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:18:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:06 | 200 |      26.305µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=9077325824 required="7.7 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="38.1 GiB" free_swap="68.9 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.278-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 41413"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.279-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] build info | build=0 commit="unknown" tid="139720255180800" timestamp=1743293905
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139720255180800" timestamp=1743293905 total_threads=16
+Mar 29 17:18:25 launchpad ollama[106886]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41413" tid="139720255180800" timestamp=1743293905
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:18:25 launchpad ollama[1510]: time=2025-03-29T17:18:25.530-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:18:25 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:18:25 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:18:25 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 16384
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:18:26 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:18:26 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:18:26 launchpad ollama[106886]: INFO [main] model loaded | tid="139720255180800" timestamp=1743293906
+Mar 29 17:18:26 launchpad ollama[1510]: time=2025-03-29T17:18:26.534-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:18:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:34 | 200 |  9.681979087s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:18:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:35 | 200 |  677.945416ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:18:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:18:38 | 200 |  2.856671931s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:37:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:37:32 | 200 |      28.974µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8734965760 required="6.2 GiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.715-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.716-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45447"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.717-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] build info | build=0 commit="unknown" tid="140641303601152" timestamp=1743295070
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140641303601152" timestamp=1743295070 total_threads=16
+Mar 29 17:37:50 launchpad ollama[113882]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45447" tid="140641303601152" timestamp=1743295070
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:37:50 launchpad ollama[1510]: time=2025-03-29T17:37:50.968-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:37:50 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:37:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:37:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:37:51 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:37:51 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:37:51 launchpad ollama[113882]: INFO [main] model loaded | tid="140641303601152" timestamp=1743295071
+Mar 29 17:37:51 launchpad ollama[1510]: time=2025-03-29T17:37:51.971-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:37:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:37:57 | 200 |  6.687304313s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:43:55 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:43:55 | 200 |     698.996µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:58:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:58:46 | 200 |     598.295µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 17:59:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:28 | 200 |      32.202µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.440-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8898215936 required="6.2 GiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.440-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.4 GiB" free_swap="68.9 GiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.441-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45883"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] build info | build=0 commit="unknown" tid="140180380872704" timestamp=1743296372
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140180380872704" timestamp=1743296372 total_threads=16
+Mar 29 17:59:32 launchpad ollama[122308]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45883" tid="140180380872704" timestamp=1743296372
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 17:59:32 launchpad ollama[1510]: time=2025-03-29T17:59:32.693-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 17:59:32 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 17:59:32 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 17:59:32 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 17:59:33 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 17:59:33 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 17:59:33 launchpad ollama[122308]: INFO [main] model loaded | tid="140180380872704" timestamp=1743296373
+Mar 29 17:59:33 launchpad ollama[1510]: time=2025-03-29T17:59:33.696-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 17:59:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:36 | 200 |  3.966688616s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:59:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:36 | 200 |  377.108442ms |       127.0.0.1 | POST     "/api/chat"
+Mar 29 17:59:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 17:59:39 | 200 |  2.943474695s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:03:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:03:08 | 200 |  5.278321885s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:06:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:06:12 | 200 |  5.060123665s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:07:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:28 | 200 |     964.908µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:07:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:33 | 200 |      27.917µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:07:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:07:49 | 200 |      27.009µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:09:47 launchpad ollama[1510]: time=2025-03-29T18:09:47.378-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8905359360 required="6.5 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.4 GiB" free_swap="68.9 GiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.080-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46311"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.082-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] build info | build=0 commit="unknown" tid="140020102225920" timestamp=1743296988
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140020102225920" timestamp=1743296988 total_threads=16
+Mar 29 18:09:48 launchpad ollama[125078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46311" tid="140020102225920" timestamp=1743296988
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 18:09:48 launchpad ollama[1510]: time=2025-03-29T18:09:48.333-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:09:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:09:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:09:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:09:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:09:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:09:49 launchpad ollama[125078]: INFO [main] model loaded | tid="140020102225920" timestamp=1743296989
+Mar 29 18:09:49 launchpad ollama[1510]: time=2025-03-29T18:09:49.337-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 18:10:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:02 | 200 | 15.523117655s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:10:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:03 | 200 |  1.017832846s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:10:05 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:10:05 | 200 |  1.236342831s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:13:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:13:28 | 200 |      28.056µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:13:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:13:49 | 200 |      25.423µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:16:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:16:48 | 200 |      28.128µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:21:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:21:11 | 200 |      29.659µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932360192 required="6.2 GiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.2 GiB" free_swap="68.9 GiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.851-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.852-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38689"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:22:33 launchpad ollama[1510]: time=2025-03-29T18:22:33.853-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] build info | build=0 commit="unknown" tid="140086025785344" timestamp=1743297753
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140086025785344" timestamp=1743297753 total_threads=16
+Mar 29 18:22:33 launchpad ollama[129081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38689" tid="140086025785344" timestamp=1743297753
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 18:22:33 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 18:22:34 launchpad ollama[1510]: time=2025-03-29T18:22:34.104-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:22:34 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:22:34 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:22:34 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:22:34 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:22:34 launchpad ollama[129081]: INFO [main] model loaded | tid="140086025785344" timestamp=1743297754
+Mar 29 18:22:35 launchpad ollama[1510]: time=2025-03-29T18:22:35.108-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 18:22:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:35 | 200 |  1.629465188s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 18:22:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:35 | 200 |  188.390586ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 18:22:35 launchpad ollama[1510]: time=2025-03-29T18:22:35.689-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932491264 required="6.5 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="31.1 GiB" free_swap="68.9 GiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.373-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44743"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.374-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] build info | build=0 commit="unknown" tid="140247785431040" timestamp=1743297756
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140247785431040" timestamp=1743297756 total_threads=16
+Mar 29 18:22:36 launchpad ollama[129113]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44743" tid="140247785431040" timestamp=1743297756
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 18:22:36 launchpad ollama[1510]: time=2025-03-29T18:22:36.625-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 18:22:36 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 18:22:36 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 18:22:36 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 18:22:37 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 18:22:37 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 18:22:37 launchpad ollama[129113]: INFO [main] model loaded | tid="140247785431040" timestamp=1743297757
+Mar 29 18:22:37 launchpad ollama[1510]: time=2025-03-29T18:22:37.628-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 18:22:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:50 | 200 | 14.653973203s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:22:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:51 | 200 |  1.664478257s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:22:53 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:22:53 | 200 |  1.216371711s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 18:23:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:23:59 | 200 |     612.232µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:25:48 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:25:48 | 200 |     518.057µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 18:38:59 launchpad ollama[1510]: [GIN] 2025/03/29 - 18:38:59 | 200 |     1.95315ms |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:03:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:03:01 | 200 |     710.615µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:14:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:14:13 | 200 |     571.773µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:14:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:14:41 | 200 |      32.573µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.033-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42605"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.035-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] build info | build=0 commit="unknown" tid="139766912868352" timestamp=1743300939
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139766912868352" timestamp=1743300939 total_threads=16
+Mar 29 19:15:39 launchpad ollama[144822]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42605" tid="139766912868352" timestamp=1743300939
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:15:39 launchpad ollama[1510]: time=2025-03-29T19:15:39.286-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:15:39 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:15:39 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:15:39 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:15:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:15:40 launchpad ollama[144822]: INFO [main] model loaded | tid="139766912868352" timestamp=1743300940
+Mar 29 19:15:40 launchpad ollama[1510]: time=2025-03-29T19:15:40.290-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:15:41 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:15:41 | 200 |  3.014868197s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:06 launchpad ollama[1510]: time=2025-03-29T19:16:06.400-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947040256 required="6.5 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.123-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33891"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.124-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] build info | build=0 commit="unknown" tid="140273985212416" timestamp=1743300967
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140273985212416" timestamp=1743300967 total_threads=16
+Mar 29 19:16:07 launchpad ollama[144939]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33891" tid="140273985212416" timestamp=1743300967
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:16:07 launchpad ollama[1510]: time=2025-03-29T19:16:07.376-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:07 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:07 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:07 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:09 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:09 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:10 launchpad ollama[144939]: INFO [main] model loaded | tid="140273985212416" timestamp=1743300970
+Mar 29 19:16:10 launchpad ollama[1510]: time=2025-03-29T19:16:10.133-07:00 level=INFO source=server.go:626 msg="llama runner started in 3.01 seconds"
+Mar 29 19:16:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:17 | 200 |  11.65726168s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:16:18 launchpad ollama[1510]: time=2025-03-29T19:16:18.965-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.682-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.683-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34981"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.684-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] build info | build=0 commit="unknown" tid="140137406337024" timestamp=1743300979
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140137406337024" timestamp=1743300979 total_threads=16
+Mar 29 19:16:19 launchpad ollama[145026]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34981" tid="140137406337024" timestamp=1743300979
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:16:19 launchpad ollama[1510]: time=2025-03-29T19:16:19.935-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:20 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:20 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:20 launchpad ollama[145026]: INFO [main] model loaded | tid="140137406337024" timestamp=1743300980
+Mar 29 19:16:20 launchpad ollama[1510]: time=2025-03-29T19:16:20.938-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:16:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:21 | 200 |  2.304047755s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:25 | 200 |   164.46829ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |  1.042665096s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |   82.974107ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:26 | 200 |  164.100542ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:16:26 launchpad ollama[1510]: time=2025-03-29T19:16:26.768-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.474-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.5 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.474-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.475-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.475-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42745"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.476-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] build info | build=0 commit="unknown" tid="139800153673728" timestamp=1743300987
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139800153673728" timestamp=1743300987 total_threads=16
+Mar 29 19:16:27 launchpad ollama[145107]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42745" tid="139800153673728" timestamp=1743300987
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:16:27 launchpad ollama[1510]: time=2025-03-29T19:16:27.727-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:16:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:16:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:16:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:16:28 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:16:28 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:16:28 launchpad ollama[145107]: INFO [main] model loaded | tid="139800153673728" timestamp=1743300988
+Mar 29 19:16:28 launchpad ollama[1510]: time=2025-03-29T19:16:28.731-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:16:36 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:16:36 | 200 |  9.842079814s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:18:09 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:09 | 200 |  1.638019863s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:18:10 launchpad ollama[1510]: time=2025-03-29T19:18:10.324-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.2 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.3 GiB" free_swap="68.9 GiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.027-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.028-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39675"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.029-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] build info | build=0 commit="unknown" tid="140691987349504" timestamp=1743301091
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140691987349504" timestamp=1743301091 total_threads=16
+Mar 29 19:18:11 launchpad ollama[145577]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39675" tid="140691987349504" timestamp=1743301091
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:18:11 launchpad ollama[1510]: time=2025-03-29T19:18:11.280-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:18:11 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:18:11 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:18:11 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:18:12 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:18:12 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:18:12 launchpad ollama[145577]: INFO [main] model loaded | tid="140691987349504" timestamp=1743301092
+Mar 29 19:18:12 launchpad ollama[1510]: time=2025-03-29T19:18:12.284-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:18:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:12 | 200 |  2.342627575s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:12 | 200 |  165.867903ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  607.373291ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  166.575372ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:13 | 200 |  167.455848ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:18:13 launchpad ollama[1510]: time=2025-03-29T19:18:13.801-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8947302400 required="6.5 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41827"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.526-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] build info | build=0 commit="unknown" tid="139784904335360" timestamp=1743301094
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139784904335360" timestamp=1743301094 total_threads=16
+Mar 29 19:18:14 launchpad ollama[145623]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41827" tid="139784904335360" timestamp=1743301094
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:18:14 launchpad ollama[1510]: time=2025-03-29T19:18:14.777-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:18:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:18:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:18:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:18:15 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:18:15 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:18:15 launchpad ollama[145623]: INFO [main] model loaded | tid="139784904335360" timestamp=1743301095
+Mar 29 19:18:15 launchpad ollama[1510]: time=2025-03-29T19:18:15.781-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:18:16 launchpad ollama[145623]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1023 n_keep=4 n_left=2044 n_shift=1022 tid="139784904335360" timestamp=1743301096
+Mar 29 19:18:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:18:33 | 200 | 19.739859237s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:21:05 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:05 | 200 |  7.414607705s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:21:15 launchpad ollama[1510]: time=2025-03-29T19:21:15.361-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.2 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.096-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43935"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.097-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] build info | build=0 commit="unknown" tid="140224637255680" timestamp=1743301276
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140224637255680" timestamp=1743301276 total_threads=16
+Mar 29 19:21:16 launchpad ollama[146421]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43935" tid="140224637255680" timestamp=1743301276
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:21:16 launchpad ollama[1510]: time=2025-03-29T19:21:16.349-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:21:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:21:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:21:16 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:21:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:21:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:21:17 launchpad ollama[146421]: INFO [main] model loaded | tid="140224637255680" timestamp=1743301277
+Mar 29 19:21:17 launchpad ollama[1510]: time=2025-03-29T19:21:17.352-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:21:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:17 | 200 |  2.345376212s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:17 | 200 |  178.181848ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |  953.155192ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |   95.123191ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:18 | 200 |  176.692628ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.184-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.5 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.885-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44869"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:21:19 launchpad ollama[1510]: time=2025-03-29T19:21:19.886-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] build info | build=0 commit="unknown" tid="140033613987840" timestamp=1743301279
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140033613987840" timestamp=1743301279 total_threads=16
+Mar 29 19:21:19 launchpad ollama[146455]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44869" tid="140033613987840" timestamp=1743301279
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:21:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:21:20 launchpad ollama[1510]: time=2025-03-29T19:21:20.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:21:20 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:21:20 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:21:20 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:21:20 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:21:20 launchpad ollama[146455]: INFO [main] model loaded | tid="140033613987840" timestamp=1743301280
+Mar 29 19:21:21 launchpad ollama[1510]: time=2025-03-29T19:21:21.141-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:21:39 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:21:39 | 200 | 20.608513117s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:24:32 launchpad ollama[146455]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3009 n_keep=4 n_left=2044 n_shift=1022 tid="140033613987840" timestamp=1743301472
+Mar 29 19:24:57 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:24:57 | 200 | 25.210920696s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:24:57 launchpad ollama[1510]: time=2025-03-29T19:24:57.719-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.2 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.485-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33181"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.486-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] build info | build=0 commit="unknown" tid="140250669076480" timestamp=1743301498
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140250669076480" timestamp=1743301498 total_threads=16
+Mar 29 19:24:58 launchpad ollama[147404]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33181" tid="140250669076480" timestamp=1743301498
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:24:58 launchpad ollama[1510]: time=2025-03-29T19:24:58.738-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:24:58 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:24:58 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:24:58 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:24:59 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:24:59 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:24:59 launchpad ollama[147404]: INFO [main] model loaded | tid="140250669076480" timestamp=1743301499
+Mar 29 19:24:59 launchpad ollama[1510]: time=2025-03-29T19:24:59.742-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:25:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:00 | 200 |  3.173991021s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:01 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:01 | 200 |  982.464629ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:02 | 200 |  870.407053ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:03 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:03 | 200 |  941.090479ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:04 | 200 |  981.459742ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:25:04 launchpad ollama[1510]: time=2025-03-29T19:25:04.762-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.468-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8992129024 required="6.5 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.469-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.469-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44611"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.470-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] build info | build=0 commit="unknown" tid="140124223516672" timestamp=1743301505
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140124223516672" timestamp=1743301505 total_threads=16
+Mar 29 19:25:05 launchpad ollama[147480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44611" tid="140124223516672" timestamp=1743301505
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:25:05 launchpad ollama[1510]: time=2025-03-29T19:25:05.721-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:25:05 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:25:05 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:25:05 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:25:06 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:25:06 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:25:06 launchpad ollama[147480]: INFO [main] model loaded | tid="140124223516672" timestamp=1743301506
+Mar 29 19:25:06 launchpad ollama[1510]: time=2025-03-29T19:25:06.724-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:25:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:25:25 | 200 | 20.884409081s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:26:13 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:26:13 | 200 |      24.915µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.180-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.899-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39859"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:27:44 launchpad ollama[1510]: time=2025-03-29T19:27:44.900-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] build info | build=0 commit="unknown" tid="140328937287680" timestamp=1743301664
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140328937287680" timestamp=1743301664 total_threads=16
+Mar 29 19:27:44 launchpad ollama[148205]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39859" tid="140328937287680" timestamp=1743301664
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:27:44 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:27:45 launchpad ollama[1510]: time=2025-03-29T19:27:45.152-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:27:45 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:27:45 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:27:45 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:27:45 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:27:45 launchpad ollama[148205]: INFO [main] model loaded | tid="140328937287680" timestamp=1743301665
+Mar 29 19:27:46 launchpad ollama[1510]: time=2025-03-29T19:27:46.156-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:27:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:46 | 200 |  2.352563192s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:46 | 200 |  133.497156ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |  848.001811ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |   90.798322ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:27:47 | 200 |  174.689794ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:27:47 launchpad ollama[1510]: time=2025-03-29T19:27:47.829-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.548-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38387"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.549-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] build info | build=0 commit="unknown" tid="140639076442112" timestamp=1743301668
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140639076442112" timestamp=1743301668 total_threads=16
+Mar 29 19:27:48 launchpad ollama[148239]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38387" tid="140639076442112" timestamp=1743301668
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:27:48 launchpad ollama[1510]: time=2025-03-29T19:27:48.800-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:27:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:27:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:27:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:27:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:27:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:27:49 launchpad ollama[148239]: INFO [main] model loaded | tid="140639076442112" timestamp=1743301669
+Mar 29 19:27:49 launchpad ollama[1510]: time=2025-03-29T19:27:49.802-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:28:24 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:28:24 | 200 | 36.600397079s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:29:47 launchpad ollama[148239]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1602 n_keep=4 n_left=2044 n_shift=1022 tid="140639076442112" timestamp=1743301787
+Mar 29 19:30:14 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:14 | 200 | 26.665771335s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:30:14 launchpad ollama[1510]: time=2025-03-29T19:30:14.398-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.111-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.111-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.5 GiB" free_swap="68.9 GiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.112-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44021"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.113-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] build info | build=0 commit="unknown" tid="140053578866688" timestamp=1743301815
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140053578866688" timestamp=1743301815 total_threads=16
+Mar 29 19:30:15 launchpad ollama[148901]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44021" tid="140053578866688" timestamp=1743301815
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:30:15 launchpad ollama[1510]: time=2025-03-29T19:30:15.364-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:30:15 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:30:15 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:30:15 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:30:16 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:30:16 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:30:16 launchpad ollama[148901]: INFO [main] model loaded | tid="140053578866688" timestamp=1743301816
+Mar 29 19:30:16 launchpad ollama[1510]: time=2025-03-29T19:30:16.368-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:30:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:16 | 200 |  2.303012071s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:16 | 200 |   190.64113ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  933.720343ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  107.107543ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:17 | 200 |  188.381827ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.189-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957394944 required="6.5 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.895-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35763"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.896-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:30:18 launchpad ollama[1510]: time=2025-03-29T19:30:18.897-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] build info | build=0 commit="unknown" tid="140544602787840" timestamp=1743301818
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140544602787840" timestamp=1743301818 total_threads=16
+Mar 29 19:30:18 launchpad ollama[148937]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35763" tid="140544602787840" timestamp=1743301818
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:30:18 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:30:19 launchpad ollama[1510]: time=2025-03-29T19:30:19.148-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:30:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:30:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:30:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:30:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:30:20 launchpad ollama[148937]: INFO [main] model loaded | tid="140544602787840" timestamp=1743301820
+Mar 29 19:30:20 launchpad ollama[1510]: time=2025-03-29T19:30:20.152-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:30:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:30:34 | 200 | 16.909366725s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:30:44 launchpad ollama[148937]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1384 n_keep=4 n_left=2044 n_shift=1022 tid="140544602787840" timestamp=1743301844
+Mar 29 19:31:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:08 | 200 |  25.01725736s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.073-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957132800 required="6.2 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.778-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36695"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.779-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:31:09 launchpad ollama[1510]: time=2025-03-29T19:31:09.780-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] build info | build=0 commit="unknown" tid="140110638301184" timestamp=1743301869
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140110638301184" timestamp=1743301869 total_threads=16
+Mar 29 19:31:09 launchpad ollama[149206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36695" tid="140110638301184" timestamp=1743301869
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:31:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:31:10 launchpad ollama[1510]: time=2025-03-29T19:31:10.031-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:31:10 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:31:10 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:31:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:31:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:31:10 launchpad ollama[149206]: INFO [main] model loaded | tid="140110638301184" timestamp=1743301870
+Mar 29 19:31:10 launchpad ollama[1510]: time=2025-03-29T19:31:10.785-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.01 seconds"
+Mar 29 19:31:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:10 | 200 |  2.029661009s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:11 | 200 |  184.019162ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |  948.727948ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |    102.5928ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:12 | 200 |  185.694818ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:31:12 launchpad ollama[1510]: time=2025-03-29T19:31:12.597-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8957132800 required="6.5 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.302-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36083"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.303-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.304-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] build info | build=0 commit="unknown" tid="140352045539328" timestamp=1743301873
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140352045539328" timestamp=1743301873 total_threads=16
+Mar 29 19:31:13 launchpad ollama[149248]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36083" tid="140352045539328" timestamp=1743301873
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:31:13 launchpad ollama[1510]: time=2025-03-29T19:31:13.554-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:31:13 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:31:13 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:31:13 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:31:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:31:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:31:14 launchpad ollama[149248]: INFO [main] model loaded | tid="140352045539328" timestamp=1743301874
+Mar 29 19:31:14 launchpad ollama[1510]: time=2025-03-29T19:31:14.558-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:31:21 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:31:21 | 200 |  9.064561113s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:31:55 launchpad ollama[149248]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1649 n_keep=4 n_left=2044 n_shift=1022 tid="140352045539328" timestamp=1743301915
+Mar 29 19:32:08 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:08 | 200 | 12.886429699s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:32:08 launchpad ollama[1510]: time=2025-03-29T19:32:08.790-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.524-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40947"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.525-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] build info | build=0 commit="unknown" tid="140410352848896" timestamp=1743301929
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140410352848896" timestamp=1743301929 total_threads=16
+Mar 29 19:32:09 launchpad ollama[149506]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40947" tid="140410352848896" timestamp=1743301929
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:32:09 launchpad ollama[1510]: time=2025-03-29T19:32:09.776-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:09 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:09 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:09 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:10 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:10 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:10 launchpad ollama[149506]: INFO [main] model loaded | tid="140410352848896" timestamp=1743301930
+Mar 29 19:32:10 launchpad ollama[1510]: time=2025-03-29T19:32:10.780-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:32:10 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:10 | 200 |  2.331624054s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:11 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:11 | 200 |  168.753064ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |   1.03890153s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |  127.578393ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:12 | 200 |  166.203922ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:32:12 launchpad ollama[1510]: time=2025-03-29T19:32:12.705-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.404-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.405-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43015"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.406-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] build info | build=0 commit="unknown" tid="140671766847488" timestamp=1743301933
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140671766847488" timestamp=1743301933 total_threads=16
+Mar 29 19:32:13 launchpad ollama[149577]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43015" tid="140671766847488" timestamp=1743301933
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:32:13 launchpad ollama[1510]: time=2025-03-29T19:32:13.657-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:13 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:13 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:13 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:14 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:14 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:14 launchpad ollama[149577]: INFO [main] model loaded | tid="140671766847488" timestamp=1743301934
+Mar 29 19:32:14 launchpad ollama[1510]: time=2025-03-29T19:32:14.661-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:32:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:32:43 | 200 | 30.733695259s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:32:57 launchpad ollama[1510]: time=2025-03-29T19:32:57.617-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.336-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.337-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46135"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.337-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.338-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.338-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] build info | build=0 commit="unknown" tid="140039007854592" timestamp=1743301978
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140039007854592" timestamp=1743301978 total_threads=16
+Mar 29 19:32:58 launchpad ollama[149801]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46135" tid="140039007854592" timestamp=1743301978
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:32:58 launchpad ollama[1510]: time=2025-03-29T19:32:58.589-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:32:58 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:32:58 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:32:58 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:32:59 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:32:59 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:32:59 launchpad ollama[149801]: INFO [main] model loaded | tid="140039007854592" timestamp=1743301979
+Mar 29 19:32:59 launchpad ollama[1510]: time=2025-03-29T19:32:59.593-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:33:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:00 | 200 |   3.41028149s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:15 launchpad ollama[1510]: time=2025-03-29T19:33:15.939-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.647-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33563"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.648-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] build info | build=0 commit="unknown" tid="140138493497344" timestamp=1743301996
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140138493497344" timestamp=1743301996 total_threads=16
+Mar 29 19:33:16 launchpad ollama[149900]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33563" tid="140138493497344" timestamp=1743301996
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:33:16 launchpad ollama[1510]: time=2025-03-29T19:33:16.899-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:17 launchpad ollama[149900]: INFO [main] model loaded | tid="140138493497344" timestamp=1743301997
+Mar 29 19:33:17 launchpad ollama[1510]: time=2025-03-29T19:33:17.902-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:33:17 launchpad ollama[149900]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1287 n_keep=4 n_left=2044 n_shift=1022 tid="140138493497344" timestamp=1743301997
+Mar 29 19:33:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:29 | 200 | 13.579537466s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:33:29 launchpad ollama[1510]: time=2025-03-29T19:33:29.505-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.2 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.206-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.207-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39367"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.207-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.208-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.208-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] build info | build=0 commit="unknown" tid="140336398630912" timestamp=1743302010
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140336398630912" timestamp=1743302010 total_threads=16
+Mar 29 19:33:30 launchpad ollama[149990]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39367" tid="140336398630912" timestamp=1743302010
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:33:30 launchpad ollama[1510]: time=2025-03-29T19:33:30.459-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:31 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:31 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:31 launchpad ollama[149990]: INFO [main] model loaded | tid="140336398630912" timestamp=1743302011
+Mar 29 19:33:31 launchpad ollama[1510]: time=2025-03-29T19:33:31.462-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:33:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:31 | 200 |  2.283415193s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:31 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:31 | 200 |  155.392816ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |  853.282051ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |   74.143485ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:32 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:32 | 200 |  113.733131ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:33 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:33 | 200 |  1.134762366s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:34 | 200 |   73.051234ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:33:34 | 200 |  157.075762ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:33:34 launchpad ollama[1510]: time=2025-03-29T19:33:34.458-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8951103488 required="6.5 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.183-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33705"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.184-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] build info | build=0 commit="unknown" tid="140655059640320" timestamp=1743302015
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140655059640320" timestamp=1743302015 total_threads=16
+Mar 29 19:33:35 launchpad ollama[150028]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33705" tid="140655059640320" timestamp=1743302015
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:33:35 launchpad ollama[1510]: time=2025-03-29T19:33:35.435-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:33:35 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:33:35 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:33:35 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:33:36 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:33:36 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:33:36 launchpad ollama[150028]: INFO [main] model loaded | tid="140655059640320" timestamp=1743302016
+Mar 29 19:33:36 launchpad ollama[1510]: time=2025-03-29T19:33:36.438-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:34:06 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:34:06 | 200 | 32.315525661s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:35:30 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:35:30 | 200 |     575.985µs |       127.0.0.1 | GET      "/api/tags"
+Mar 29 19:35:34 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:35:34 | 200 |       24.44µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:36:02 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:02 | 200 |   5.89327997s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:13 launchpad ollama[1510]: time=2025-03-29T19:36:13.416-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932032512 required="6.2 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.134-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.135-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32949"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.136-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] build info | build=0 commit="unknown" tid="139902623862784" timestamp=1743302174
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139902623862784" timestamp=1743302174 total_threads=16
+Mar 29 19:36:14 launchpad ollama[150737]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32949" tid="139902623862784" timestamp=1743302174
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:36:14 launchpad ollama[1510]: time=2025-03-29T19:36:14.386-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:36:14 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:36:14 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:36:14 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:36:15 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:36:15 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:36:15 launchpad ollama[150737]: INFO [main] model loaded | tid="139902623862784" timestamp=1743302175
+Mar 29 19:36:15 launchpad ollama[1510]: time=2025-03-29T19:36:15.390-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:36:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:15 | 200 |   2.31917171s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:36:15 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:15 | 200 |    129.7234ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:36:15 launchpad ollama[1510]: time=2025-03-29T19:36:15.896-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.588-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8932032512 required="6.5 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.588-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.589-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.589-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33651"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.590-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] build info | build=0 commit="unknown" tid="140212018692096" timestamp=1743302176
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140212018692096" timestamp=1743302176 total_threads=16
+Mar 29 19:36:16 launchpad ollama[150780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33651" tid="140212018692096" timestamp=1743302176
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:36:16 launchpad ollama[1510]: time=2025-03-29T19:36:16.840-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:36:16 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:36:16 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:36:16 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:36:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:36:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:36:17 launchpad ollama[150780]: INFO [main] model loaded | tid="140212018692096" timestamp=1743302177
+Mar 29 19:36:17 launchpad ollama[1510]: time=2025-03-29T19:36:17.845-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:36:37 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:37 | 200 | 21.335777997s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:38 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:38 | 200 |  1.279943052s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:36:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:36:40 | 200 |  1.732500632s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:38:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:20 | 200 | 12.653378378s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:38:20 launchpad ollama[1510]: time=2025-03-29T19:38:20.960-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8945991680 required="6.2 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.662-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42663"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.663-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] build info | build=0 commit="unknown" tid="139869312413696" timestamp=1743302301
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139869312413696" timestamp=1743302301 total_threads=16
+Mar 29 19:38:21 launchpad ollama[151356]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42663" tid="139869312413696" timestamp=1743302301
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:38:21 launchpad ollama[1510]: time=2025-03-29T19:38:21.914-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:38:21 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:38:21 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:38:21 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:38:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:38:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:38:22 launchpad ollama[151356]: INFO [main] model loaded | tid="139869312413696" timestamp=1743302302
+Mar 29 19:38:22 launchpad ollama[1510]: time=2025-03-29T19:38:22.918-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:38:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:23 | 200 |  2.305049387s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:38:23 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:23 | 200 |  176.483754ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:38:23 launchpad ollama[1510]: time=2025-03-29T19:38:23.502-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.204-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8945991680 required="6.5 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.204-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.205-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40331"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.206-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] build info | build=0 commit="unknown" tid="140402633170944" timestamp=1743302304
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140402633170944" timestamp=1743302304 total_threads=16
+Mar 29 19:38:24 launchpad ollama[151388]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40331" tid="140402633170944" timestamp=1743302304
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:38:24 launchpad ollama[1510]: time=2025-03-29T19:38:24.458-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:38:24 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:38:24 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:38:24 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:38:25 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:38:25 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:38:25 launchpad ollama[151388]: INFO [main] model loaded | tid="140402633170944" timestamp=1743302305
+Mar 29 19:38:25 launchpad ollama[1510]: time=2025-03-29T19:38:25.462-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:38:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:38:40 | 200 | 17.534730037s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:40:25 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:25 | 200 | 13.699120848s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:40:25 launchpad ollama[1510]: time=2025-03-29T19:40:25.395-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8943894528 required="6.2 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.118-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.119-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41631"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.120-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] build info | build=0 commit="unknown" tid="139980962881536" timestamp=1743302426
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139980962881536" timestamp=1743302426 total_threads=16
+Mar 29 19:40:26 launchpad ollama[152028]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41631" tid="139980962881536" timestamp=1743302426
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:40:26 launchpad ollama[1510]: time=2025-03-29T19:40:26.371-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:40:26 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:40:26 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:40:26 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:40:27 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:40:27 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:40:27 launchpad ollama[152028]: INFO [main] model loaded | tid="139980962881536" timestamp=1743302427
+Mar 29 19:40:27 launchpad ollama[1510]: time=2025-03-29T19:40:27.374-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:40:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:27 | 200 |  2.321062718s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:40:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:27 | 200 |  171.984812ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:40:27 launchpad ollama[1510]: time=2025-03-29T19:40:27.946-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8943894528 required="6.5 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.642-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.643-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34229"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.644-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] build info | build=0 commit="unknown" tid="140371490562048" timestamp=1743302428
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140371490562048" timestamp=1743302428 total_threads=16
+Mar 29 19:40:28 launchpad ollama[152060]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34229" tid="140371490562048" timestamp=1743302428
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:40:28 launchpad ollama[1510]: time=2025-03-29T19:40:28.895-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:40:28 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:40:28 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:40:28 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:40:29 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:40:29 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:40:29 launchpad ollama[152060]: INFO [main] model loaded | tid="140371490562048" timestamp=1743302429
+Mar 29 19:40:29 launchpad ollama[1510]: time=2025-03-29T19:40:29.898-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:40:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:40:49 | 200 | 22.011537931s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:42:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:18 | 200 | 21.984149732s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.225-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.927-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8960671744 required="6.2 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.927-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.928-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.928-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39601"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:42:18 launchpad ollama[1510]: time=2025-03-29T19:42:18.929-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] build info | build=0 commit="unknown" tid="140265974153216" timestamp=1743302538
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140265974153216" timestamp=1743302538 total_threads=16
+Mar 29 19:42:18 launchpad ollama[152547]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39601" tid="140265974153216" timestamp=1743302538
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:42:18 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:42:19 launchpad ollama[1510]: time=2025-03-29T19:42:19.180-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:42:19 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:42:19 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:42:19 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:42:19 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:42:19 launchpad ollama[152547]: INFO [main] model loaded | tid="140265974153216" timestamp=1743302539
+Mar 29 19:42:20 launchpad ollama[1510]: time=2025-03-29T19:42:20.184-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:42:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:20 | 200 |  2.303696929s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:42:20 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:20 | 200 |  171.320282ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:42:20 launchpad ollama[1510]: time=2025-03-29T19:42:20.734-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.432-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8960671744 required="6.5 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.432-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.433-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34841"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.434-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] build info | build=0 commit="unknown" tid="140171713527808" timestamp=1743302541
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140171713527808" timestamp=1743302541 total_threads=16
+Mar 29 19:42:21 launchpad ollama[152614]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34841" tid="140171713527808" timestamp=1743302541
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:42:21 launchpad ollama[1510]: time=2025-03-29T19:42:21.685-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:42:21 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:42:21 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:42:21 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:42:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:42:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:42:22 launchpad ollama[152614]: INFO [main] model loaded | tid="140171713527808" timestamp=1743302542
+Mar 29 19:42:22 launchpad ollama[1510]: time=2025-03-29T19:42:22.688-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:42:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:42:40 | 200 | 19.623128288s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:43:44 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:44 | 200 |  6.429165086s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:43:44 launchpad ollama[1510]: time=2025-03-29T19:43:44.756-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.2 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8941469696 required="6.2 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.466-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45405"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.468-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] build info | build=0 commit="unknown" tid="139653163413504" timestamp=1743302625
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139653163413504" timestamp=1743302625 total_threads=16
+Mar 29 19:43:45 launchpad ollama[152991]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45405" tid="139653163413504" timestamp=1743302625
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:43:45 launchpad ollama[1510]: time=2025-03-29T19:43:45.719-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:43:45 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:43:45 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:43:45 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:43:46 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:43:46 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:43:46 launchpad ollama[152991]: INFO [main] model loaded | tid="139653163413504" timestamp=1743302626
+Mar 29 19:43:46 launchpad ollama[1510]: time=2025-03-29T19:43:46.723-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:43:46 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:46 | 200 |  2.333994117s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:43:47 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:43:47 | 200 |  167.006683ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.269-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.994-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8941469696 required="6.5 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.995-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.995-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34321"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:43:47 launchpad ollama[1510]: time=2025-03-29T19:43:47.996-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] build info | build=0 commit="unknown" tid="140311399333888" timestamp=1743302628
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140311399333888" timestamp=1743302628 total_threads=16
+Mar 29 19:43:48 launchpad ollama[153025]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34321" tid="140311399333888" timestamp=1743302628
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:43:48 launchpad ollama[1510]: time=2025-03-29T19:43:48.247-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:43:48 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:43:48 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:43:48 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:43:49 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:43:49 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:43:49 launchpad ollama[153025]: INFO [main] model loaded | tid="140311399333888" timestamp=1743302629
+Mar 29 19:43:49 launchpad ollama[1510]: time=2025-03-29T19:43:49.252-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:44:04 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:44:04 | 200 | 17.402217395s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:47:10 launchpad ollama[153025]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1475 n_keep=4 n_left=2044 n_shift=1022 tid="140311399333888" timestamp=1743302830
+Mar 29 19:47:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:47:16 | 200 |  6.205132773s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:47:35 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:47:35 | 200 |      36.636µs |       127.0.0.1 | GET      "/api/version"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.143-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8966701056 required="6.2 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.843-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38745"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:48:10 launchpad ollama[1510]: time=2025-03-29T19:48:10.844-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] build info | build=0 commit="unknown" tid="139790732369920" timestamp=1743302890
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790732369920" timestamp=1743302890 total_threads=16
+Mar 29 19:48:10 launchpad ollama[154151]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38745" tid="139790732369920" timestamp=1743302890
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:48:10 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:48:11 launchpad ollama[1510]: time=2025-03-29T19:48:11.095-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:48:11 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:48:11 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:48:11 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:48:11 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:48:11 launchpad ollama[154151]: INFO [main] model loaded | tid="139790732369920" timestamp=1743302891
+Mar 29 19:48:12 launchpad ollama[1510]: time=2025-03-29T19:48:12.098-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:48:16 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:48:16 | 200 |  6.364819376s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:50:00 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:50:00 | 200 |  3.466462233s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:50:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:50:29 | 200 |  6.391877077s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:51:49 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:49 | 200 |   176.22525ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |  160.778236ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |  786.856972ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:50 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:50 | 200 |   78.256203ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:51 | 200 |  161.909934ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:51 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:51 | 200 |  797.424571ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:52 | 200 |   78.491835ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:51:52 | 200 |  159.008679ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:51:52 launchpad ollama[1510]: time=2025-03-29T19:51:52.242-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:51:52 launchpad ollama[1510]: time=2025-03-29T19:51:52.410-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.135-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.135-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.4 GiB" free_swap="68.9 GiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.136-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35249"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.137-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] build info | build=0 commit="unknown" tid="140389238358016" timestamp=1743303113
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140389238358016" timestamp=1743303113 total_threads=16
+Mar 29 19:51:53 launchpad ollama[155156]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35249" tid="140389238358016" timestamp=1743303113
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:51:53 launchpad ollama[1510]: time=2025-03-29T19:51:53.388-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:51:53 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:51:53 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:51:53 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:51:54 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:51:54 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:51:54 launchpad ollama[155156]: INFO [main] model loaded | tid="140389238358016" timestamp=1743303114
+Mar 29 19:51:54 launchpad ollama[1510]: time=2025-03-29T19:51:54.391-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:52:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:52:28 | 200 | 35.790191346s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:54:14 launchpad ollama[1510]: time=2025-03-29T19:54:14.889-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.3 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.2 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.593-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44435"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.594-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] build info | build=0 commit="unknown" tid="140376766713856" timestamp=1743303255
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140376766713856" timestamp=1743303255 total_threads=16
+Mar 29 19:54:15 launchpad ollama[155780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44435" tid="140376766713856" timestamp=1743303255
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:54:15 launchpad ollama[1510]: time=2025-03-29T19:54:15.846-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:54:15 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:54:15 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:54:15 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:54:16 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:54:16 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:54:16 launchpad ollama[155780]: INFO [main] model loaded | tid="140376766713856" timestamp=1743303256
+Mar 29 19:54:16 launchpad ollama[1510]: time=2025-03-29T19:54:16.850-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:17 | 200 |  2.332103187s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:17 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:17 | 200 |  180.036992ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  865.748366ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  136.299068ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:18 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:18 | 200 |  176.635973ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |  766.638269ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |    95.16046ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:19 | 200 |  177.320643ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:54:19 launchpad ollama[1510]: time=2025-03-29T19:54:19.534-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:54:19 launchpad ollama[1510]: time=2025-03-29T19:54:19.703-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.421-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.421-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.422-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42129"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.423-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] build info | build=0 commit="unknown" tid="140250340904960" timestamp=1743303260
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140250340904960" timestamp=1743303260 total_threads=16
+Mar 29 19:54:20 launchpad ollama[155852]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42129" tid="140250340904960" timestamp=1743303260
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:54:20 launchpad ollama[1510]: time=2025-03-29T19:54:20.674-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:54:20 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:54:20 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:54:20 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:54:21 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:54:21 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:54:21 launchpad ollama[155852]: INFO [main] model loaded | tid="140250340904960" timestamp=1743303261
+Mar 29 19:54:21 launchpad ollama[1510]: time=2025-03-29T19:54:21.678-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:54:40 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:54:40 | 200 | 21.020815256s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.268-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.4 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.2 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:58:24 launchpad ollama[1510]: time=2025-03-29T19:58:24.999-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44513"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.000-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] build info | build=0 commit="unknown" tid="140456813043712" timestamp=1743303505
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140456813043712" timestamp=1743303505 total_threads=16
+Mar 29 19:58:25 launchpad ollama[156905]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44513" tid="140456813043712" timestamp=1743303505
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 29 19:58:25 launchpad ollama[1510]: time=2025-03-29T19:58:25.251-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:58:25 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:58:25 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:58:25 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:58:25 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:58:26 launchpad ollama[156905]: INFO [main] model loaded | tid="140456813043712" timestamp=1743303506
+Mar 29 19:58:26 launchpad ollama[1510]: time=2025-03-29T19:58:26.255-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 29 19:58:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:26 | 200 |  2.382167762s |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:26 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:26 | 200 |  201.386254ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  871.494809ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  119.866405ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:27 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:27 | 200 |  204.574165ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  600.398574ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  120.301771ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:28 | 200 |  162.348729ms |       127.0.0.1 | POST     "/api/embed"
+Mar 29 19:58:28 launchpad ollama[1510]: time=2025-03-29T19:58:28.845-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=system
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.016-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8977448960 required="6.5 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.766-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44621"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 29 19:58:29 launchpad ollama[1510]: time=2025-03-29T19:58:29.767-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] build info | build=0 commit="unknown" tid="139723562520576" timestamp=1743303509
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139723562520576" timestamp=1743303509 total_threads=16
+Mar 29 19:58:29 launchpad ollama[156944]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44621" tid="139723562520576" timestamp=1743303509
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.type str              = model
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type  f32:   66 tensors
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type q4_K:  193 tensors
+Mar 29 19:58:29 launchpad ollama[1510]: llama_model_loader: - type q6_K:   33 tensors
+Mar 29 19:58:30 launchpad ollama[1510]: time=2025-03-29T19:58:30.018-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 131072
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 29 19:58:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 29 19:58:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 29 19:58:30 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 29 19:58:30 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 29 19:58:30 launchpad ollama[156944]: INFO [main] model loaded | tid="139723562520576" timestamp=1743303510
+Mar 29 19:58:31 launchpad ollama[1510]: time=2025-03-29T19:58:31.022-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 29 19:58:43 launchpad ollama[1510]: [GIN] 2025/03/29 - 19:58:43 | 200 | 15.143637267s |       127.0.0.1 | POST     "/api/chat"
+Mar 29 20:01:15 launchpad ollama[156944]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2319 n_keep=4 n_left=2044 n_shift=1022 tid="139723562520576" timestamp=1743303675
+Mar 29 20:01:29 launchpad ollama[1510]: [GIN] 2025/03/29 - 20:01:29 | 200 | 14.849592956s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:14:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:14:54 | 200 |     623.106µs |       127.0.0.1 | GET      "/api/tags"
+Mar 30 11:14:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:14:54 | 200 |      31.653µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 11:15:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:09 | 200 |      31.878µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8935112704 required="6.2 GiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.615-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37551"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.617-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] build info | build=0 commit="unknown" tid="140582430531584" timestamp=1743358522
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140582430531584" timestamp=1743358522 total_threads=16
+Mar 30 11:15:22 launchpad ollama[190865]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37551" tid="140582430531584" timestamp=1743358522
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:15:22 launchpad ollama[1510]: time=2025-03-30T11:15:22.868-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:15:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:15:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:15:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:15:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:15:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:15:23 launchpad ollama[190865]: INFO [main] model loaded | tid="140582430531584" timestamp=1743358523
+Mar 30 11:15:23 launchpad ollama[1510]: time=2025-03-30T11:15:23.870-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:15:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:33 | 200 | 10.617418065s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:15:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:33 | 200 |  671.686201ms |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:15:35 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:15:35 | 200 |  2.190579097s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:18:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:18:12 | 200 | 10.393218154s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:21:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:21:51 | 200 |  9.948355464s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.607-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020768256 required="6.2 GiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.608-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.608-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38379"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.609-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.610-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] build info | build=0 commit="unknown" tid="139712057430016" timestamp=1743359326
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139712057430016" timestamp=1743359326 total_threads=16
+Mar 30 11:28:46 launchpad ollama[194362]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38379" tid="139712057430016" timestamp=1743359326
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:28:46 launchpad ollama[1510]: time=2025-03-30T11:28:46.860-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:28:46 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:28:46 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:28:46 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:28:47 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:28:47 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:28:47 launchpad ollama[194362]: INFO [main] model loaded | tid="139712057430016" timestamp=1743359327
+Mar 30 11:28:47 launchpad ollama[1510]: time=2025-03-30T11:28:47.863-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:28:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:28:59 | 200 | 12.679412756s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.346-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35069"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.347-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.348-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] build info | build=0 commit="unknown" tid="140001028354048" timestamp=1743359736
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140001028354048" timestamp=1743359736 total_threads=16
+Mar 30 11:35:36 launchpad ollama[196139]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35069" tid="140001028354048" timestamp=1743359736
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:35:36 launchpad ollama[1510]: time=2025-03-30T11:35:36.599-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:35:36 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:35:36 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:35:36 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:35:37 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:35:37 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:35:37 launchpad ollama[196139]: INFO [main] model loaded | tid="140001028354048" timestamp=1743359737
+Mar 30 11:35:37 launchpad ollama[1510]: time=2025-03-30T11:35:37.603-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 11:35:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:35:45 | 200 |  9.316356775s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.822-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.823-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33833"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:43:46 launchpad ollama[1510]: time=2025-03-30T11:43:46.824-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] build info | build=0 commit="unknown" tid="140325241839616" timestamp=1743360226
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140325241839616" timestamp=1743360226 total_threads=16
+Mar 30 11:43:46 launchpad ollama[198235]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33833" tid="140325241839616" timestamp=1743360226
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:43:46 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:43:47 launchpad ollama[1510]: time=2025-03-30T11:43:47.074-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:43:47 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:43:47 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:43:47 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:43:47 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:43:47 launchpad ollama[198235]: INFO [main] model loaded | tid="140325241839616" timestamp=1743360227
+Mar 30 11:43:48 launchpad ollama[1510]: time=2025-03-30T11:43:48.079-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 11:43:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:43:50 | 200 |  3.468874029s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:44:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:44:45 | 200 |  6.057648133s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.954-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.954-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.955-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38283"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 11:53:50 launchpad ollama[1510]: time=2025-03-30T11:53:50.956-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] build info | build=0 commit="unknown" tid="139775037440000" timestamp=1743360830
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139775037440000" timestamp=1743360830 total_threads=16
+Mar 30 11:53:50 launchpad ollama[200808]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38283" tid="139775037440000" timestamp=1743360830
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 11:53:51 launchpad ollama[1510]: time=2025-03-30T11:53:51.207-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 11:53:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 11:53:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 11:53:51 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 11:53:51 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 11:53:51 launchpad ollama[200808]: INFO [main] model loaded | tid="139775037440000" timestamp=1743360831
+Mar 30 11:53:52 launchpad ollama[1510]: time=2025-03-30T11:53:52.210-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 11:54:03 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:54:03 | 200 | 12.881324152s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 11:57:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 11:57:29 | 200 |  5.018045448s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:01:12 | 200 |  5.891354226s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:02:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:02:55 | 200 | 11.031168326s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:03:18 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:03:18 | 200 |  5.128749077s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:03:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:03:43 | 200 |  3.159849674s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8995602432 required="6.2 GiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="34.0 GiB" free_swap="68.9 GiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.021-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.022-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44363"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.023-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] build info | build=0 commit="unknown" tid="140634735337472" timestamp=1743364697
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140634735337472" timestamp=1743364697 total_threads=16
+Mar 30 12:58:17 launchpad ollama[217969]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44363" tid="140634735337472" timestamp=1743364697
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 12:58:17 launchpad ollama[1510]: time=2025-03-30T12:58:17.274-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 12:58:17 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 12:58:17 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 12:58:17 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 12:58:17 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 12:58:18 launchpad ollama[217969]: INFO [main] model loaded | tid="140634735337472" timestamp=1743364698
+Mar 30 12:58:18 launchpad ollama[1510]: time=2025-03-30T12:58:18.278-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 12:58:22 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:58:22 | 200 |  5.918463234s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:58:26 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:58:26 | 200 |   3.34311065s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:32 | 200 |  904.989732ms |       127.0.0.1 | POST     "/api/chat"
+Mar 30 12:59:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:40 | 200 |  3.682284619s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:42 | 200 |  2.106195761s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:43 | 200 |   94.146459ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:43 | 200 |  174.747682ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |  1.020546761s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |   52.718223ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:44 | 200 |  174.778406ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:45 | 200 |  990.164803ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 12:59:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 12:59:51 | 200 |  6.282808147s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:00:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:00:51 | 200 |  5.588454751s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:00:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:00:55 | 200 |  4.103858218s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:12 | 200 |  188.608016ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:12 | 200 |  185.173926ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:13 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:13 | 200 |   1.10262918s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:17 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:17 | 200 |  3.974637544s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:01:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:49 | 200 |   1.67097826s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:01:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:50 | 200 |  185.531573ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:50 | 200 |  187.898384ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:01:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:01:51 | 200 |  770.498947ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:00 | 200 |  9.195115212s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:02:28 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:28 | 200 |  1.776794277s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:02:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:29 | 200 |  176.978784ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:29 | 200 |    174.6454ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:30 | 200 |  1.062071109s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:02:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:02:38 | 200 |  8.280355153s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:04:58 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=4317 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365098
+Mar 30 13:05:02 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:02 | 200 |  4.556235272s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:05:03 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140634735337472" timestamp=1743365103
+Mar 30 13:05:03 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:03 | 200 |  1.092054906s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:04 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140634735337472" timestamp=1743365104
+Mar 30 13:05:05 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:05 | 200 |  1.089213965s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:06 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:06 | 200 |  1.158635314s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:05:06 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1302 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365106
+Mar 30 13:05:12 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:05:12 | 200 |  5.933884693s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:08:25 launchpad ollama[217969]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1998 n_keep=24 n_left=2024 n_shift=1012 tid="140634735337472" timestamp=1743365305
+Mar 30 13:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:31 | 200 |  6.277962524s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:31 | 200 |  199.192105ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:32 | 200 |  200.257129ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:33 | 200 |  1.118850329s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:08:44 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:08:44 | 200 | 11.280558324s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:10:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:58 | 200 |  1.957155103s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:10:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:58 | 200 |  174.897519ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:10:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:59 | 200 |  177.076028ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:10:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:10:59 | 200 |  815.533281ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:11:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:11:09 | 200 |   9.72070433s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020768256 required="6.2 GiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.773-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45455"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:16:26 launchpad ollama[1510]: time=2025-03-30T13:16:26.775-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] build info | build=0 commit="unknown" tid="139857329553408" timestamp=1743365786
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139857329553408" timestamp=1743365786 total_threads=16
+Mar 30 13:16:26 launchpad ollama[222729]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45455" tid="139857329553408" timestamp=1743365786
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:16:26 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:16:27 launchpad ollama[1510]: time=2025-03-30T13:16:27.026-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:16:27 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:16:27 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:16:27 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:16:27 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:16:27 launchpad ollama[222729]: INFO [main] model loaded | tid="139857329553408" timestamp=1743365787
+Mar 30 13:16:28 launchpad ollama[1510]: time=2025-03-30T13:16:28.030-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 13:16:29 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:29 | 200 |  2.966369395s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:16:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:30 | 200 |  198.167659ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:30 | 200 |  199.286527ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:31 | 200 |  919.299764ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:16:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:16:40 | 200 |  8.881800137s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:18:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:50 | 200 |  200.994987ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:50 | 200 |  182.687511ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:51 | 200 |  1.212547742s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:18:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:18:58 | 200 |  6.346474242s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:21:36 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:36 | 200 |  213.977971ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:36 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:36 | 200 |  190.166012ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:37 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:37 | 200 |   1.13022637s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:21:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:21:49 | 200 | 12.000309524s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045934080 required="6.2 GiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.222-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42283"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.224-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] build info | build=0 commit="unknown" tid="140490867326976" timestamp=1743366691
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140490867326976" timestamp=1743366691 total_threads=16
+Mar 30 13:31:31 launchpad ollama[226605]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42283" tid="140490867326976" timestamp=1743366691
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:31:31 launchpad ollama[1510]: time=2025-03-30T13:31:31.474-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:31:31 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:31:31 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:31:31 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:31:32 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:31:32 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:31:32 launchpad ollama[226605]: INFO [main] model loaded | tid="140490867326976" timestamp=1743366692
+Mar 30 13:31:32 launchpad ollama[1510]: time=2025-03-30T13:31:32.478-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 13:31:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:32 | 200 |  1.618017075s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:32 | 200 |  218.929625ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:33 | 200 |  903.814406ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:31:45 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:31:45 | 200 |  11.67831139s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:35:30 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:30 | 200 |  215.015164ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:31 | 200 |  197.705893ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:32 | 200 |  1.150242635s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:35:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:35:43 | 200 | 11.022267554s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:37:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:55 | 200 |  194.491155ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:37:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:55 | 200 |  184.108833ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:37:56 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:37:56 | 200 |  1.094639872s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:38:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:38:08 | 200 | 12.487599999s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.007-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9019392000 required="6.2 GiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.008-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.008-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35511"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.009-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] build info | build=0 commit="unknown" tid="139861381177344" timestamp=1743367462
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139861381177344" timestamp=1743367462 total_threads=16
+Mar 30 13:44:22 launchpad ollama[229918]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35511" tid="139861381177344" timestamp=1743367462
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:44:22 launchpad ollama[1510]: time=2025-03-30T13:44:22.260-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:44:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:44:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:44:22 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:44:22 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:44:23 launchpad ollama[229918]: INFO [main] model loaded | tid="139861381177344" timestamp=1743367463
+Mar 30 13:44:23 launchpad ollama[1510]: time=2025-03-30T13:44:23.265-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 13:44:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:23 | 200 |  1.631172672s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:23 | 200 |  148.745836ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:24 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:24 | 200 |  923.062233ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:44:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:44:33 | 200 |  8.958894716s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045803008 required="6.2 GiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.115-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.116-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43271"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.117-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] build info | build=0 commit="unknown" tid="139733190529024" timestamp=1743368036
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139733190529024" timestamp=1743368036 total_threads=16
+Mar 30 13:53:56 launchpad ollama[232446]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43271" tid="139733190529024" timestamp=1743368036
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 13:53:56 launchpad ollama[1510]: time=2025-03-30T13:53:56.368-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 13:53:56 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 13:53:56 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 13:53:56 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 13:53:57 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 13:53:57 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 13:53:57 launchpad ollama[232446]: INFO [main] model loaded | tid="139733190529024" timestamp=1743368037
+Mar 30 13:53:57 launchpad ollama[1510]: time=2025-03-30T13:53:57.372-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 13:53:57 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:57 | 200 |   1.70512428s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:53:57 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:57 | 200 |  258.535774ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:53:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:53:58 | 200 |  818.066094ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 13:54:07 launchpad ollama[1510]: [GIN] 2025/03/30 - 13:54:07 | 200 |  8.317710889s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.156-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.157-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.157-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43753"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.158-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] build info | build=0 commit="unknown" tid="140200415158272" timestamp=1743368910
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140200415158272" timestamp=1743368910 total_threads=16
+Mar 30 14:08:30 launchpad ollama[236233]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43753" tid="140200415158272" timestamp=1743368910
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:08:30 launchpad ollama[1510]: time=2025-03-30T14:08:30.409-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:08:30 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:08:30 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:08:30 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:08:31 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:08:31 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:08:31 launchpad ollama[236233]: INFO [main] model loaded | tid="140200415158272" timestamp=1743368911
+Mar 30 14:08:31 launchpad ollama[1510]: time=2025-03-30T14:08:31.414-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:08:31 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:31 | 200 |  1.752032178s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:32 | 200 |  307.782722ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:33 | 200 |  1.008237057s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:08:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:08:41 | 200 |  8.861937194s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9002483712 required="6.2 GiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.865-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.866-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41317"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:15:39 launchpad ollama[1510]: time=2025-03-30T14:15:39.867-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] build info | build=0 commit="unknown" tid="140401679781888" timestamp=1743369339
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140401679781888" timestamp=1743369339 total_threads=16
+Mar 30 14:15:39 launchpad ollama[238117]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41317" tid="140401679781888" timestamp=1743369339
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:15:39 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:15:40 launchpad ollama[1510]: time=2025-03-30T14:15:40.118-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:15:40 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:15:40 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:15:40 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:15:40 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:15:40 launchpad ollama[238117]: INFO [main] model loaded | tid="140401679781888" timestamp=1743369340
+Mar 30 14:15:41 launchpad ollama[1510]: time=2025-03-30T14:15:41.123-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:15:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:41 | 200 |  1.673962631s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:41 | 200 |  214.491787ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:42 | 200 |  924.597862ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:15:50 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:15:50 | 200 |  8.197635357s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:20:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:42 | 200 |  242.350589ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:42 | 200 |  219.744923ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:43 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:43 | 200 |  1.109669094s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:20:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:20:53 | 200 |  9.705641534s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.356-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.356-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.357-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43383"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] build info | build=0 commit="unknown" tid="140680711970816" timestamp=1743370282
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140680711970816" timestamp=1743370282 total_threads=16
+Mar 30 14:31:22 launchpad ollama[242210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43383" tid="140680711970816" timestamp=1743370282
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:31:22 launchpad ollama[1510]: time=2025-03-30T14:31:22.609-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:31:22 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:31:22 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:31:22 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:31:23 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:31:23 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:31:23 launchpad ollama[242210]: INFO [main] model loaded | tid="140680711970816" timestamp=1743370283
+Mar 30 14:31:23 launchpad ollama[1510]: time=2025-03-30T14:31:23.612-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 14:31:23 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:23 | 200 |  1.721336635s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:24 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:24 | 200 |  227.394231ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:25 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:25 | 200 |   1.08918971s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:31:35 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:31:35 | 200 | 10.015729494s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8996454400 required="6.2 GiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.368-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.369-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38611"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.370-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] build info | build=0 commit="unknown" tid="140516085518336" timestamp=1743371255
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140516085518336" timestamp=1743371255 total_threads=16
+Mar 30 14:47:35 launchpad ollama[246746]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38611" tid="140516085518336" timestamp=1743371255
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:47:35 launchpad ollama[1510]: time=2025-03-30T14:47:35.621-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:47:35 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:47:35 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:47:35 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:47:36 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:47:36 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:47:36 launchpad ollama[246746]: INFO [main] model loaded | tid="140516085518336" timestamp=1743371256
+Mar 30 14:47:36 launchpad ollama[1510]: time=2025-03-30T14:47:36.625-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:47:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:38 | 200 |  3.187421732s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:47:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:39 | 200 |   367.25905ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:39 | 200 |  364.747795ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:40 | 200 |  1.061008605s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:47:46 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:47:46 | 200 |  6.031404853s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9006940160 required="6.2 GiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.8 GiB" free_swap="68.9 GiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.166-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33717"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.168-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] build info | build=0 commit="unknown" tid="140162681102336" timestamp=1743371702
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140162681102336" timestamp=1743371702 total_threads=16
+Mar 30 14:55:02 launchpad ollama[248655]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33717" tid="140162681102336" timestamp=1743371702
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 14:55:02 launchpad ollama[1510]: time=2025-03-30T14:55:02.419-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 14:55:02 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 14:55:02 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 14:55:02 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 14:55:03 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 14:55:03 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 14:55:03 launchpad ollama[248655]: INFO [main] model loaded | tid="140162681102336" timestamp=1743371703
+Mar 30 14:55:03 launchpad ollama[1510]: time=2025-03-30T14:55:03.424-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 14:55:05 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:05 | 200 |  3.476752927s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 14:55:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:08 | 200 |  206.680476ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:08 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:08 | 200 |  205.091541ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:09 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:09 | 200 |  956.182999ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 14:55:16 launchpad ollama[1510]: [GIN] 2025/03/30 - 14:55:16 | 200 |  7.199068594s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.198-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37485"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.200-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] build info | build=0 commit="unknown" tid="139670217154560" timestamp=1743372292
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139670217154560" timestamp=1743372292 total_threads=16
+Mar 30 15:04:52 launchpad ollama[251181]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37485" tid="139670217154560" timestamp=1743372292
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:04:52 launchpad ollama[1510]: time=2025-03-30T15:04:52.451-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:04:52 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:04:52 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:04:52 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:04:53 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:04:53 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:04:53 launchpad ollama[251181]: INFO [main] model loaded | tid="139670217154560" timestamp=1743372293
+Mar 30 15:04:53 launchpad ollama[1510]: time=2025-03-30T15:04:53.454-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 15:04:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:53 | 200 |  1.714657288s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:04:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:53 | 200 |  266.660072ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:04:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:04:54 | 200 |  909.679115ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:05:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:05:00 | 200 |  5.889296542s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9002745856 required="6.2 GiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.464-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.465-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42603"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.466-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] build info | build=0 commit="unknown" tid="140247288848384" timestamp=1743373131
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140247288848384" timestamp=1743373131 total_threads=16
+Mar 30 15:18:51 launchpad ollama[254756]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42603" tid="140247288848384" timestamp=1743373131
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:18:51 launchpad ollama[1510]: time=2025-03-30T15:18:51.717-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:18:51 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:18:51 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:18:51 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:18:52 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:18:52 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:18:52 launchpad ollama[254756]: INFO [main] model loaded | tid="140247288848384" timestamp=1743373132
+Mar 30 15:18:52 launchpad ollama[1510]: time=2025-03-30T15:18:52.721-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:18:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:53 | 200 |  1.729779035s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:18:53 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:53 | 200 |  285.014496ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:18:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:18:54 | 200 |  953.831686ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:19:02 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:19:02 | 200 |  7.843429909s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:21:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:40 | 200 |  203.782143ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:41 | 200 |  190.182538ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:42 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:42 | 200 |  1.162467695s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:21:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:21:48 | 200 |    6.0991206s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.003-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.004-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39767"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.005-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] build info | build=0 commit="unknown" tid="139839142899712" timestamp=1743373974
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139839142899712" timestamp=1743373974 total_threads=16
+Mar 30 15:32:54 launchpad ollama[258356]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39767" tid="139839142899712" timestamp=1743373974
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:32:54 launchpad ollama[1510]: time=2025-03-30T15:32:54.255-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:32:54 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:32:54 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:32:54 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:32:54 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:32:55 launchpad ollama[258356]: INFO [main] model loaded | tid="139839142899712" timestamp=1743373975
+Mar 30 15:32:55 launchpad ollama[1510]: time=2025-03-30T15:32:55.259-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 15:32:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:55 | 200 |  1.716980171s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:32:55 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:55 | 200 |  257.183863ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:32:56 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:32:56 | 200 |  1.021002209s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:33:04 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:33:04 | 200 |  7.321296408s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9020899328 required="6.2 GiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.6 GiB" free_swap="68.9 GiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.295-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.296-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42135"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.297-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] build info | build=0 commit="unknown" tid="140419643985920" timestamp=1743374431
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140419643985920" timestamp=1743374431 total_threads=16
+Mar 30 15:40:31 launchpad ollama[260348]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42135" tid="140419643985920" timestamp=1743374431
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:40:31 launchpad ollama[1510]: time=2025-03-30T15:40:31.548-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:40:31 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:40:31 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:40:31 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:40:32 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:40:32 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:40:32 launchpad ollama[260348]: INFO [main] model loaded | tid="140419643985920" timestamp=1743374432
+Mar 30 15:40:32 launchpad ollama[1510]: time=2025-03-30T15:40:32.552-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:32 | 200 |  1.677300117s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:32 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:32 | 200 |  219.154515ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:33 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:33 | 200 |  998.378093ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:40:41 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:40:41 | 200 |  7.661978973s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9044688896 required="6.2 GiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.7 GiB" free_swap="68.9 GiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.104-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.105-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42937"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.106-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] build info | build=0 commit="unknown" tid="140163381395456" timestamp=1743375047
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140163381395456" timestamp=1743375047 total_threads=16
+Mar 30 15:50:47 launchpad ollama[263048]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42937" tid="140163381395456" timestamp=1743375047
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 15:50:47 launchpad ollama[1510]: time=2025-03-30T15:50:47.357-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 15:50:47 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 15:50:47 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 15:50:47 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 15:50:48 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 15:50:48 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 15:50:48 launchpad ollama[263048]: INFO [main] model loaded | tid="140163381395456" timestamp=1743375048
+Mar 30 15:50:48 launchpad ollama[1510]: time=2025-03-30T15:50:48.361-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Mar 30 15:50:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:48 | 200 |  1.713678009s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:50:48 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:48 | 200 |  269.004212ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:50:49 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:50:49 | 200 |  1.020975522s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 15:51:00 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:51:00 | 200 | 10.493888459s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 15:54:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:54:54 | 200 |    1.579821ms |       127.0.0.1 | GET      "/api/tags"
+Mar 30 15:54:54 launchpad ollama[1510]: [GIN] 2025/03/30 - 15:54:54 | 200 |      35.176µs |       127.0.0.1 | GET      "/api/version"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9026863104 required="6.2 GiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.338-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43533"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.340-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] build info | build=0 commit="unknown" tid="140020290863104" timestamp=1743375757
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140020290863104" timestamp=1743375757 total_threads=16
+Mar 30 16:02:37 launchpad ollama[266350]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43533" tid="140020290863104" timestamp=1743375757
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 16:02:37 launchpad ollama[1510]: time=2025-03-30T16:02:37.591-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 16:02:37 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 16:02:37 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 16:02:37 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 16:02:38 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 16:02:38 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 16:02:38 launchpad ollama[266350]: INFO [main] model loaded | tid="140020290863104" timestamp=1743375758
+Mar 30 16:02:38 launchpad ollama[1510]: time=2025-03-30T16:02:38.593-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 16:02:38 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:38 | 200 |  1.702624975s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:39 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:39 | 200 |  258.790518ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:40 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:40 | 200 |  1.003156361s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:02:51 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:02:51 | 200 |  11.65021323s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9001697280 required="6.2 GiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="33.9 GiB" free_swap="68.9 GiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.800-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1630742596/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46001"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.801-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Mar 30 16:11:56 launchpad ollama[1510]: time=2025-03-30T16:11:56.802-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] build info | build=0 commit="unknown" tid="140115138469888" timestamp=1743376316
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140115138469888" timestamp=1743376316 total_threads=16
+Mar 30 16:11:56 launchpad ollama[268772]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46001" tid="140115138469888" timestamp=1743376316
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type  f32:   65 tensors
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type q4_0:  225 tensors
+Mar 30 16:11:56 launchpad ollama[1510]: llama_model_loader: - type q6_K:    1 tensors
+Mar 30 16:11:57 launchpad ollama[1510]: time=2025-03-30T16:11:57.053-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_vocab: special tokens cache size = 256
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: format           = GGUF V3 (latest)
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: arch             = llama
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: vocab type       = BPE
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_vocab          = 128256
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_merges         = 280147
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: vocab_only       = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ctx_train      = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd           = 4096
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_layer          = 32
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_head           = 32
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_head_kv        = 8
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_rot            = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_swa            = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_k    = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_head_v    = 128
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_gqa            = 4
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ff             = 14336
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_expert         = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_expert_used    = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: causal attn      = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: pooling type     = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope type        = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope scaling     = linear
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: freq_base_train  = 500000.0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: freq_scale_train = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: rope_finetuned   = unknown
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_conv       = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_inner      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_d_state      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_rank      = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model type       = 8B
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model ftype      = Q4_0
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model params     = 8.03 B
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: LF token         = 128 'Ä'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_print_meta: max token length = 256
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Mar 30 16:11:57 launchpad ollama[1510]: ggml_cuda_init: found 1 CUDA devices:
+Mar 30 16:11:57 launchpad ollama[1510]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloading 32 repeating layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloading non-repeating layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors: offloaded 33/33 layers to GPU
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_ctx      = 8192
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_batch    = 512
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: n_ubatch   = 512
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: flash_attn = 0
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: freq_base  = 500000.0
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: freq_scale = 1
+Mar 30 16:11:57 launchpad ollama[1510]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: graph nodes  = 1030
+Mar 30 16:11:57 launchpad ollama[1510]: llama_new_context_with_model: graph splits = 2
+Mar 30 16:11:57 launchpad ollama[268772]: INFO [main] model loaded | tid="140115138469888" timestamp=1743376317
+Mar 30 16:11:58 launchpad ollama[1510]: time=2025-03-30T16:11:58.056-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Mar 30 16:11:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:58 | 200 |  1.667946555s |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:11:58 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:58 | 200 |   218.09716ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:11:59 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:11:59 | 200 |  992.509671ms |       127.0.0.1 | POST     "/api/embed"
+Mar 30 16:12:07 launchpad ollama[1510]: [GIN] 2025/03/30 - 16:12:07 | 200 |  8.252179407s |       127.0.0.1 | POST     "/api/chat"
+Mar 30 16:17:07 launchpad ollama[1510]: cuda driver library failed to get device context 2time=2025-03-30T16:17:07.940-07:00 level=WARN source=gpu.go:400 msg="error looking up nvidia GPU memory"
+Mar 30 17:37:01 launchpad ollama[1510]: [GIN] 2025/03/30 - 17:37:01 | 200 |     576.005µs |       127.0.0.1 | GET      "/api/tags"
+Mar 30 17:37:01 launchpad ollama[1510]: [GIN] 2025/03/30 - 17:37:01 | 200 |      27.907µs |       127.0.0.1 | GET      "/api/version"
+Apr 02 12:13:56 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:13:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:13:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:13:56 launchpad systemd[1]: ollama.service: Consumed 31min 47.435s CPU time, 6G memory peak, 4K memory swap peak, 4.3G read from disk, 5G written to disk.
+-- Boot a52400493ab945b0aa668c54f0abba5c --
+Apr 02 12:14:30 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:14:30 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:14:31 launchpad ollama[1515]: 2025/04/02 12:14:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.078-07:00 level=INFO source=images.go:753 msg="total blobs: 34"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.089-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 1"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.091-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:14:31 launchpad ollama[1515]: time=2025-04-02T12:14:31.093-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2075395292/runners
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.067-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.068-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.068-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.069-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.069-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:14:34 launchpad ollama[1515]: time=2025-04-02T12:14:34.303-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 12:18:17 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:18:17 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:18:17 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:18:17 launchpad systemd[1]: ollama.service: Consumed 3.402s CPU time, 787.1M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot c63e1f966157468781dc04a3cb210db4 --
+Apr 02 12:18:54 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:18:54 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:18:54 launchpad ollama[1515]: 2025/04/02 12:18:54 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.676-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.687-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.689-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:18:54 launchpad ollama[1515]: time=2025-04-02T12:18:54.690-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1727149981/runners
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.682-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.683-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:18:57 launchpad ollama[1515]: time=2025-04-02T12:18:57.684-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:19:05 launchpad ollama[1515]: time=2025-04-02T12:19:05.616-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 02 12:20:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:20:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:20:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:20:45 launchpad systemd[1]: ollama.service: Consumed 11.356s CPU time, 787.2M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 260ec0b4bc2a4473a8dfa956ac0f6f03 --
+Apr 02 12:21:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:21:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:21:20 launchpad ollama[1509]: 2025/04/02 12:21:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.003-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.015-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.017-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:21:21 launchpad ollama[1509]: time=2025-04-02T12:21:21.018-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3567153864/runners
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.001-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.003-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.003-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.004-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.004-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:21:24 launchpad ollama[1509]: time=2025-04-02T12:21:24.234-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 12:37:00 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 12:37:00 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 12:37:00 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 12:37:00 launchpad systemd[1]: ollama.service: Consumed 3.421s CPU time, 787.5M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 173713d559934faf95ac519561197f5f --
+Apr 02 12:37:32 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 12:37:32 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 12:37:32 launchpad ollama[1512]: 2025/04/02 12:37:32 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.497-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.507-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.508-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 12:37:32 launchpad ollama[1512]: time=2025-04-02T12:37:32.511-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1619976437/runners
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.554-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.555-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.555-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.556-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.556-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 12:37:35 launchpad ollama[1512]: time=2025-04-02T12:37:35.770-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 13:05:03 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 13:05:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 13:05:03 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 13:05:03 launchpad systemd[1]: ollama.service: Consumed 3.489s CPU time, 787.1M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot e700071ab6664809b3f524d7cf5fb9ac --
+Apr 02 13:05:37 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 13:05:37 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 13:05:37 launchpad ollama[1556]: 2025/04/02 13:05:37 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.349-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.359-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.362-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 13:05:37 launchpad ollama[1556]: time=2025-04-02T13:05:37.364-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2881439680/runners
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.399-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.400-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:05:40 launchpad ollama[1556]: time=2025-04-02T13:05:40.631-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 13:27:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 13:27:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 13:27:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 13:27:38 launchpad systemd[1]: ollama.service: Consumed 3.487s CPU time, 788.2M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot f51e83401288435294bac45db37486d6 --
+Apr 02 13:28:13 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 13:28:13 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 13:28:14 launchpad ollama[1547]: 2025/04/02 13:28:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.091-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.099-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.100-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 13:28:14 launchpad ollama[1547]: time=2025-04-02T13:28:14.102-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3883982625/runners
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.071-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.072-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.072-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.073-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 13:28:17 launchpad ollama[1547]: time=2025-04-02T13:28:17.305-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 02 14:00:56 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 02 14:00:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 02 14:00:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 02 14:00:56 launchpad systemd[1]: ollama.service: Consumed 3.426s CPU time, 787.5M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot cf8eb89e6cc84de0ae5317ce0068970d --
+Apr 02 14:01:27 launchpad systemd[1]: Starting Server for local large language models...
+Apr 02 14:01:27 launchpad systemd[1]: Started Server for local large language models.
+Apr 02 14:01:27 launchpad ollama[1545]: 2025/04/02 14:01:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.831-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.844-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.846-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 02 14:01:27 launchpad ollama[1545]: time=2025-04-02T14:01:27.849-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3147859582/runners
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.822-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.824-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.824-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.825-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:30 launchpad ollama[1545]: time=2025-04-02T14:01:30.825-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 02 14:01:31 launchpad ollama[1545]: time=2025-04-02T14:01:31.062-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 03 10:57:05 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:05 | 200 |     3.99064ms |       127.0.0.1 | GET      "/api/tags"
+Apr 03 10:57:06 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:06 | 200 |     620.572µs |       127.0.0.1 | GET      "/api/tags"
+Apr 03 10:57:06 launchpad ollama[1545]: [GIN] 2025/04/03 - 10:57:06 | 200 |     577.383µs |       127.0.0.1 | GET      "/api/version"
+-- Boot 85ddb90a318d4b65b30a92b1d80fef3e --
+Apr 08 10:18:56 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 10:18:56 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 10:18:56 launchpad ollama[1579]: 2025/04/08 10:18:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.380-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.396-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.399-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 10:18:56 launchpad ollama[1579]: time=2025-04-08T10:18:56.401-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1454344761/runners
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.330-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.331-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.331-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.332-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.332-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 10:19:04 launchpad ollama[1579]: time=2025-04-08T10:19:04.573-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 08 17:20:02 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:20:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:20:02 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:20:02 launchpad systemd[1]: ollama.service: Consumed 4.139s CPU time, 786.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+Apr 08 17:20:07 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 17:20:07 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 17:20:07 launchpad ollama[474753]: 2025/04/08 17:20:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.723-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.727-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.729-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 17:20:07 launchpad ollama[474753]: time=2025-04-08T17:20:07.730-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2879337735/runners
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.910-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:10 launchpad ollama[474753]: time=2025-04-08T17:20:10.911-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:20:11 launchpad ollama[474753]: time=2025-04-08T17:20:11.142-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="8.9 GiB"
+Apr 08 17:22:12 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:22:13 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:22:13 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:22:13 launchpad systemd[1]: ollama.service: Consumed 3.653s CPU time, 775.7M memory peak, 223.1M read from disk, 508.1M written to disk.
+-- Boot 553c6e5aa616434a99d28581f60a0555 --
+Apr 08 17:22:46 launchpad systemd[1]: Starting Server for local large language models...
+Apr 08 17:22:47 launchpad systemd[1]: Started Server for local large language models.
+Apr 08 17:22:47 launchpad ollama[1543]: 2025/04/08 17:22:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.583-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.593-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.594-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 08 17:22:47 launchpad ollama[1543]: time=2025-04-08T17:22:47.596-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3672531136/runners
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.636-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.637-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 08 17:22:50 launchpad ollama[1543]: time=2025-04-08T17:22:50.855-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 08 17:25:11 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 08 17:25:11 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 08 17:25:11 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 08 17:25:11 launchpad systemd[1]: ollama.service: Consumed 3.477s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 8d49b51a62474ba0916c4bd79a5b6d43 --
+Apr 09 09:13:24 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:13:24 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:13:24 launchpad ollama[1593]: 2025/04/09 09:13:24 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.554-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.566-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.568-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:13:24 launchpad ollama[1593]: time=2025-04-09T09:13:24.570-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama692572085/runners
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.685-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.685-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:13:27 launchpad ollama[1593]: time=2025-04-09T09:13:27.686-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:25:11 launchpad systemd[1]: Stopping Server for local large language models...
+-- Boot c0edc8fdb2134a4e89eaaf392e7ef33e --
+Apr 09 09:27:15 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:27:15 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:27:15 launchpad ollama[1548]: 2025/04/09 09:27:15 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.346-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.355-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.355-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:27:15 launchpad ollama[1548]: time=2025-04-09T09:27:15.357-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3891515531/runners
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.381-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.382-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.382-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.383-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.383-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:27:18 launchpad ollama[1548]: time=2025-04-09T09:27:18.602-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 09:40:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 09:40:38 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 09:40:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 09:40:38 launchpad systemd[1]: ollama.service: Consumed 3.467s CPU time, 787.5M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 69a7e7ab69354669831fa63c8712a136 --
+Apr 09 09:41:09 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 09:41:10 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 09:41:10 launchpad ollama[1546]: 2025/04/09 09:41:10 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.156-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.166-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.167-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 09:41:10 launchpad ollama[1546]: time=2025-04-09T09:41:10.169-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2258545132/runners
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.200-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.201-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.201-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 09:41:13 launchpad ollama[1546]: time=2025-04-09T09:41:13.415-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 09:41:24 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 09:41:24 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 09:41:24 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 09:41:24 launchpad systemd[1]: ollama.service: Consumed 3.471s CPU time, 786.9M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 4610dee92c8648ad8a9fc7a26d1a4fde --
+Apr 09 11:10:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 11:10:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 11:10:55 launchpad ollama[1628]: 2025/04/09 11:10:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 11:10:55 launchpad ollama[1628]: time=2025-04-09T11:10:55.994-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.002-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.003-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 11:10:56 launchpad ollama[1628]: time=2025-04-09T11:10:56.005-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1412616515/runners
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.022-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.023-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 11:10:59 launchpad ollama[1628]: time=2025-04-09T11:10:59.245-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |      3.9112ms |       127.0.0.1 | GET      "/api/tags"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |     615.805µs |       127.0.0.1 | GET      "/api/tags"
+Apr 09 10:33:03 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:03 | 200 |     567.133µs |       127.0.0.1 | GET      "/api/version"
+Apr 09 10:33:12 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:33:12 | 200 |      26.861µs |       127.0.0.1 | GET      "/api/version"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.550-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.697-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10101260288 required="9.2 GiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.697-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.6 GiB" free_swap="68.9 GiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.698-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1412616515/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 39735"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.699-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.700-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] build info | build=0 commit="unknown" tid="140624111030272" timestamp=1744220061
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140624111030272" timestamp=1744220061 total_threads=16
+Apr 09 10:34:21 launchpad ollama[8831]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39735" tid="140624111030272" timestamp=1744220061
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type  f32:   81 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type q4_0:  281 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llama_model_loader: - type q6_K:    1 tensors
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_vocab: special tokens cache size = 3
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: format           = GGUF V2
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: arch             = llama
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: vocab type       = SPM
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_vocab          = 32016
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_merges         = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: vocab_only       = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd           = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_layer          = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_head           = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_head_kv        = 40
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_rot            = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_swa            = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_gqa            = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ff             = 13824
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_expert         = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_expert_used    = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: causal attn      = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: pooling type     = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope type        = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope scaling     = linear
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: freq_scale_train = 1
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_d_state      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model type       = 13B
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model ftype      = Q4_0
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model params     = 13.02 B
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: general.name     = codellama
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: BOS token        = 1 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: EOS token        = 2 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: UNK token        = 0 ''
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_print_meta: max token length = 48
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 09 10:34:21 launchpad ollama[1628]: ggml_cuda_init: found 1 CUDA devices:
+Apr 09 10:34:21 launchpad ollama[1628]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 09 10:34:21 launchpad ollama[1628]: time=2025-04-09T10:34:21.951-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 09 10:34:21 launchpad ollama[1628]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 09 10:34:29 launchpad ollama[1628]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_ctx      = 2048
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_batch    = 512
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: n_ubatch   = 512
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: flash_attn = 0
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: freq_scale = 1
+Apr 09 10:34:30 launchpad ollama[1628]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: graph nodes  = 1286
+Apr 09 10:34:30 launchpad ollama[1628]: llama_new_context_with_model: graph splits = 2
+Apr 09 10:34:30 launchpad ollama[8831]: INFO [main] model loaded | tid="140624111030272" timestamp=1744220070
+Apr 09 10:34:30 launchpad ollama[1628]: time=2025-04-09T10:34:30.727-07:00 level=INFO source=server.go:626 msg="llama runner started in 9.03 seconds"
+Apr 09 10:34:41 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:41 | 200 | 19.792110296s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:34:41 launchpad ollama[1628]: time=2025-04-09T10:34:41.398-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:42 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:42 | 200 |  1.144799868s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:34:42 launchpad ollama[1628]: time=2025-04-09T10:34:42.577-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:34:48 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:34:48 | 200 |  6.182628008s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 10:38:02 launchpad ollama[1628]: time=2025-04-09T10:38:02.471-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 09 10:38:17 launchpad ollama[1628]: [GIN] 2025/04/09 - 10:38:17 | 200 | 15.421078589s |       127.0.0.1 | POST     "/api/chat"
+Apr 09 12:10:22 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:10:22 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:10:22 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:10:22 launchpad systemd[1]: ollama.service: Consumed 39.260s CPU time, 7.9G memory peak, 7.1G read from disk, 508.1M written to disk.
+-- Boot 679bf54354244e11892dcfefa494c9e4 --
+Apr 09 12:11:08 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:11:08 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:11:08 launchpad ollama[1544]: 2025/04/09 12:11:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.176-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.184-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.185-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:11:08 launchpad ollama[1544]: time=2025-04-09T12:11:08.187-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2862335710/runners
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.229-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.230-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.230-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:11:11 launchpad ollama[1544]: time=2025-04-09T12:11:11.443-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 12:16:16 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:16:16 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:16:16 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:16:16 launchpad systemd[1]: ollama.service: Consumed 3.478s CPU time, 787.1M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 7c4ea448b8b34058ab6d95569f0a0445 --
+Apr 09 12:16:53 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:16:53 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:16:53 launchpad ollama[1553]: 2025/04/09 12:16:53 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.865-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.875-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.876-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:16:53 launchpad ollama[1553]: time=2025-04-09T12:16:53.878-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3780608611/runners
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.861-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.862-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.862-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.863-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:56 launchpad ollama[1553]: time=2025-04-09T12:16:56.863-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:16:58 launchpad ollama[1553]: time=2025-04-09T12:16:58.178-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 09 12:17:15 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:17:15 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:17:15 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:17:15 launchpad systemd[1]: ollama.service: Consumed 5.345s CPU time, 787.9M memory peak, 234.4M read from disk, 508.1M written to disk.
+-- Boot 7042a4bf34f746e0ae0b9b7c8b0372a8 --
+Apr 09 12:17:47 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 12:17:47 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 12:17:47 launchpad ollama[1552]: 2025/04/09 12:17:47 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.381-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.393-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.394-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 12:17:47 launchpad ollama[1552]: time=2025-04-09T12:17:47.396-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1680189345/runners
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.385-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.386-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.386-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.387-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.387-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 12:17:50 launchpad ollama[1552]: time=2025-04-09T12:17:50.616-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 12:37:35 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 12:37:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 12:37:35 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 12:37:35 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 786.9M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c987ebd8bf4144d78c265f5bb5b9f026 --
+Apr 09 13:48:22 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:48:22 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:48:23 launchpad ollama[1555]: 2025/04/09 13:48:23 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.041-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.052-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.053-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:48:23 launchpad ollama[1555]: time=2025-04-09T13:48:23.054-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama613101396/runners
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.045-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:26 launchpad ollama[1555]: time=2025-04-09T13:48:26.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:48:27 launchpad ollama[1555]: time=2025-04-09T13:48:27.869-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 09 13:48:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 13:48:44 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 13:48:44 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 13:48:44 launchpad systemd[1]: ollama.service: Consumed 5.361s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 88dbd4a34cf44613955445ccd8d61888 --
+Apr 09 13:49:15 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:49:16 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:49:16 launchpad ollama[1545]: 2025/04/09 13:49:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.148-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.157-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.158-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:49:16 launchpad ollama[1545]: time=2025-04-09T13:49:16.159-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2274272387/runners
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.193-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.194-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.194-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.195-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.195-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:49:19 launchpad ollama[1545]: time=2025-04-09T13:49:19.416-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 13:59:14 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 13:59:14 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 13:59:14 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 13:59:14 launchpad systemd[1]: ollama.service: Consumed 3.469s CPU time, 786.9M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot ced04c8de93a40778c928ba8b9f5c2f4 --
+Apr 09 13:59:46 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 13:59:46 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 13:59:46 launchpad ollama[1547]: 2025/04/09 13:59:46 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.233-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.244-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.245-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 13:59:46 launchpad ollama[1547]: time=2025-04-09T13:59:46.246-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama800634150/runners
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.230-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.230-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.231-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 13:59:49 launchpad ollama[1547]: time=2025-04-09T13:59:49.450-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 09 14:15:11 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 09 14:15:11 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 09 14:15:11 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 09 14:15:11 launchpad systemd[1]: ollama.service: Consumed 3.424s CPU time, 786.6M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot fb67aec4a96c42b18e070001ac894f87 --
+Apr 09 14:15:42 launchpad systemd[1]: Starting Server for local large language models...
+Apr 09 14:15:42 launchpad systemd[1]: Started Server for local large language models.
+Apr 09 14:15:42 launchpad ollama[1542]: 2025/04/09 14:15:42 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.982-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.990-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.991-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 09 14:15:42 launchpad ollama[1542]: time=2025-04-09T14:15:42.992-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2499721513/runners
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.974-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:45 launchpad ollama[1542]: time=2025-04-09T14:15:45.975-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 09 14:15:46 launchpad ollama[1542]: time=2025-04-09T14:15:46.213-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 10:06:49 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 10:06:49 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 10:06:49 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 10:06:49 launchpad systemd[1]: ollama.service: Consumed 4.209s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot e9957a4137514212bb6d6f281b5a62a2 --
+Apr 10 10:08:51 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 10:08:51 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 10:08:51 launchpad ollama[1577]: 2025/04/10 10:08:51 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.571-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.581-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.582-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 10:08:51 launchpad ollama[1577]: time=2025-04-10T10:08:51.584-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3501558433/runners
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.506-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.507-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:54 launchpad ollama[1577]: time=2025-04-10T10:08:54.508-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:08:56 launchpad ollama[1577]: time=2025-04-10T10:08:56.220-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 10 10:22:44 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 10:22:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 10:22:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 10:22:45 launchpad systemd[1]: ollama.service: Consumed 5.248s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 4618c9f76f3e4a6ab487c026738f643a --
+Apr 10 10:23:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 10:23:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 10:23:21 launchpad ollama[1475]: 2025/04/10 10:23:21 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.359-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.476-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.484-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 10:23:21 launchpad ollama[1475]: time=2025-04-10T10:23:21.496-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1075261275/runners
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.634-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.643-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.648-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:28 launchpad ollama[1475]: time=2025-04-10T10:23:28.648-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 10:23:29 launchpad ollama[1475]: time=2025-04-10T10:23:29.118-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 10 14:47:23 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 14:47:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 14:47:23 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 14:47:23 launchpad systemd[1]: ollama.service: Consumed 3.581s CPU time, 788M memory peak, 235.2M read from disk, 508.1M written to disk.
+-- Boot bf8f28edaff24d31aab33845041368de --
+Apr 10 14:48:03 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 14:48:03 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 14:48:03 launchpad ollama[1765]: 2025/04/10 14:48:03 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.302-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.311-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.312-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 14:48:03 launchpad ollama[1765]: time=2025-04-10T14:48:03.314-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2361362427/runners
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.393-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.393-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.394-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 14:48:06 launchpad ollama[1765]: time=2025-04-10T14:48:06.624-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 15:04:54 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 15:04:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 15:04:54 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 15:04:54 launchpad systemd[1]: ollama.service: Consumed 3.479s CPU time, 787.3M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 0cfbb8f023a7402faa52ab71aa7747c6 --
+Apr 10 15:05:26 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 15:05:26 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 15:05:26 launchpad ollama[1767]: 2025/04/10 15:05:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.441-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.452-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.453-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 15:05:26 launchpad ollama[1767]: time=2025-04-10T15:05:26.455-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama428823615/runners
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.442-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:05:29 launchpad ollama[1767]: time=2025-04-10T15:05:29.669-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 15:24:23 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 15:24:23 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 15:24:23 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 15:24:23 launchpad systemd[1]: ollama.service: Consumed 3.455s CPU time, 787.1M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 7ccc3d496da5419f8ae6577671f300b0 --
+Apr 10 15:24:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 15:24:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 15:24:56 launchpad ollama[1755]: 2025/04/10 15:24:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.083-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.095-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.096-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 15:24:56 launchpad ollama[1755]: time=2025-04-10T15:24:56.097-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2840929771/runners
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.152-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.153-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 15:24:59 launchpad ollama[1755]: time=2025-04-10T15:24:59.365-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 10 16:02:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 10 16:02:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 10 16:02:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 10 16:02:19 launchpad systemd[1]: ollama.service: Consumed 3.495s CPU time, 787.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 7d0a614ca76f4931b080917f004526fd --
+Apr 10 16:03:00 launchpad systemd[1]: Starting Server for local large language models...
+Apr 10 16:03:00 launchpad systemd[1]: Started Server for local large language models.
+Apr 10 16:03:00 launchpad ollama[1757]: 2025/04/10 16:03:00 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.874-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.883-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.884-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 10 16:03:00 launchpad ollama[1757]: time=2025-04-10T16:03:00.886-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4085095757/runners
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.865-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.866-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.866-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.867-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:03 launchpad ollama[1757]: time=2025-04-10T16:03:03.867-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 10 16:03:04 launchpad ollama[1757]: time=2025-04-10T16:03:04.089-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 11:40:29 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 11:40:29 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 11:40:29 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 11:40:29 launchpad systemd[1]: ollama.service: Consumed 3.777s CPU time, 787.5M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot f5f3826f76534b1db33c34162bde23a7 --
+Apr 12 11:41:10 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 11:41:10 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 11:41:11 launchpad ollama[1751]: 2025/04/12 11:41:11 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.064-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.071-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.072-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 11:41:11 launchpad ollama[1751]: time=2025-04-12T11:41:11.073-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama702236440/runners
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.044-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.045-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.045-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.046-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 11:41:14 launchpad ollama[1751]: time=2025-04-12T11:41:14.283-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 15:50:45 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:45 | 200 |    3.751256ms |       127.0.0.1 | GET      "/api/tags"
+Apr 12 15:50:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:46 | 200 |     768.857µs |       127.0.0.1 | GET      "/api/tags"
+Apr 12 15:50:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:50:46 | 200 |     586.742µs |       127.0.0.1 | GET      "/api/version"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.377-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9205645312 required="6.2 GiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.377-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.378-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43409"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.380-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.381-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] build info | build=0 commit="unknown" tid="139813475696640" timestamp=1744498319
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139813475696640" timestamp=1744498319 total_threads=16
+Apr 12 15:51:59 launchpad ollama[89512]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43409" tid="139813475696640" timestamp=1744498319
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:51:59 launchpad ollama[1751]: time=2025-04-12T15:51:59.631-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:51:59 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:51:59 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:51:59 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:52:04 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:52:05 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:52:05 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:52:05 launchpad ollama[89512]: INFO [main] model loaded | tid="139813475696640" timestamp=1744498325
+Apr 12 15:52:05 launchpad ollama[1751]: time=2025-04-12T15:52:05.647-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 12 15:52:05 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:05 | 200 |  6.627753307s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:52:05 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:05 | 200 |  168.769922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.187-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9205514240 required="6.5 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.872-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37237"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:52:06 launchpad ollama[1751]: time=2025-04-12T15:52:06.873-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] build info | build=0 commit="unknown" tid="140648581529600" timestamp=1744498326
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140648581529600" timestamp=1744498326 total_threads=16
+Apr 12 15:52:06 launchpad ollama[89549]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37237" tid="140648581529600" timestamp=1744498326
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:52:06 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:52:07 launchpad ollama[1751]: time=2025-04-12T15:52:07.124-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:52:07 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:52:07 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:52:07 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:52:11 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:52:12 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:52:12 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:52:12 launchpad ollama[89549]: INFO [main] model loaded | tid="140648581529600" timestamp=1744498332
+Apr 12 15:52:12 launchpad ollama[1751]: time=2025-04-12T15:52:12.388-07:00 level=INFO source=server.go:626 msg="llama runner started in 5.51 seconds"
+Apr 12 15:52:31 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:31 | 200 | 25.176674753s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:52:33 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:33 | 200 |  1.957238926s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:52:38 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:52:38 | 200 |  5.699314963s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:56:43 launchpad ollama[1751]: time=2025-04-12T15:56:43.701-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.395-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218686976 required="6.2 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.395-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.396-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33515"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] build info | build=0 commit="unknown" tid="139842239279104" timestamp=1744498604
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139842239279104" timestamp=1744498604 total_threads=16
+Apr 12 15:56:44 launchpad ollama[90712]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33515" tid="139842239279104" timestamp=1744498604
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:56:44 launchpad ollama[1751]: time=2025-04-12T15:56:44.649-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:56:44 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:56:44 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:56:44 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:56:45 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:56:45 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:56:45 launchpad ollama[90712]: INFO [main] model loaded | tid="139842239279104" timestamp=1744498605
+Apr 12 15:56:45 launchpad ollama[1751]: time=2025-04-12T15:56:45.652-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 12 15:56:45 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:56:45 | 200 |  2.323122961s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:56:46 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:56:46 | 200 |  190.725431ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.233-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9218228224 required="6.5 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.957-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43113"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:56:46 launchpad ollama[1751]: time=2025-04-12T15:56:46.958-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] build info | build=0 commit="unknown" tid="140338040655872" timestamp=1744498606
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140338040655872" timestamp=1744498606 total_threads=16
+Apr 12 15:56:46 launchpad ollama[90780]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43113" tid="140338040655872" timestamp=1744498606
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:56:47 launchpad ollama[1751]: time=2025-04-12T15:56:47.209-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:56:47 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:56:47 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:56:47 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:56:47 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:56:48 launchpad ollama[90780]: INFO [main] model loaded | tid="140338040655872" timestamp=1744498608
+Apr 12 15:56:48 launchpad ollama[1751]: time=2025-04-12T15:56:48.213-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:06 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:06 | 200 | 20.745649127s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 15:57:26 launchpad ollama[1751]: time=2025-04-12T15:57:26.810-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.6 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.2 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.506-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34397"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.507-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] build info | build=0 commit="unknown" tid="140132836282368" timestamp=1744498647
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140132836282368" timestamp=1744498647 total_threads=16
+Apr 12 15:57:27 launchpad ollama[91152]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34397" tid="140132836282368" timestamp=1744498647
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 15:57:27 launchpad ollama[1751]: time=2025-04-12T15:57:27.758-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:57:27 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:57:27 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:57:27 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:57:28 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:57:28 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:57:28 launchpad ollama[91152]: INFO [main] model loaded | tid="140132836282368" timestamp=1744498648
+Apr 12 15:57:28 launchpad ollama[1751]: time=2025-04-12T15:57:28.761-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:28 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:28 | 200 |   2.30868954s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:57:29 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:29 | 200 |  173.378365ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.307-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9215475712 required="6.5 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.989-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33589"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 15:57:29 launchpad ollama[1751]: time=2025-04-12T15:57:29.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] build info | build=0 commit="unknown" tid="139925061128192" timestamp=1744498650
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139925061128192" timestamp=1744498650 total_threads=16
+Apr 12 15:57:30 launchpad ollama[91183]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33589" tid="139925061128192" timestamp=1744498650
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 15:57:30 launchpad ollama[1751]: time=2025-04-12T15:57:30.242-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 15:57:30 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 15:57:30 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 15:57:30 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 15:57:31 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 15:57:31 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 15:57:31 launchpad ollama[91183]: INFO [main] model loaded | tid="139925061128192" timestamp=1744498651
+Apr 12 15:57:31 launchpad ollama[1751]: time=2025-04-12T15:57:31.245-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 15:57:44 launchpad ollama[1751]: [GIN] 2025/04/12 - 15:57:44 | 200 | 14.992532084s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:01:00 launchpad ollama[1751]: time=2025-04-12T16:01:00.343-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9211412480 required="6.2 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.033-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34717"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.035-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] build info | build=0 commit="unknown" tid="140591286628352" timestamp=1744498861
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140591286628352" timestamp=1744498861 total_threads=16
+Apr 12 16:01:01 launchpad ollama[92523]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34717" tid="140591286628352" timestamp=1744498861
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:01:01 launchpad ollama[1751]: time=2025-04-12T16:01:01.286-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:01:01 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:01:01 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:01:01 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:01:02 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:01:02 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:01:02 launchpad ollama[92523]: INFO [main] model loaded | tid="140591286628352" timestamp=1744498862
+Apr 12 16:01:02 launchpad ollama[1751]: time=2025-04-12T16:01:02.290-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:01:02 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:02 | 200 |  2.341767936s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:01:02 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:02 | 200 |  210.156647ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:01:02 launchpad ollama[1751]: time=2025-04-12T16:01:02.938-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.632-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212329984 required="6.5 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.632-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.633-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37551"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.634-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] build info | build=0 commit="unknown" tid="140050573516800" timestamp=1744498863
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140050573516800" timestamp=1744498863 total_threads=16
+Apr 12 16:01:03 launchpad ollama[92554]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37551" tid="140050573516800" timestamp=1744498863
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:01:03 launchpad ollama[1751]: time=2025-04-12T16:01:03.885-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:01:03 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:01:03 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:01:03 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:01:04 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:01:04 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:01:04 launchpad ollama[92554]: INFO [main] model loaded | tid="140050573516800" timestamp=1744498864
+Apr 12 16:01:04 launchpad ollama[1751]: time=2025-04-12T16:01:04.888-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:01:30 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:01:30 | 200 | 27.372628557s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:03:01 launchpad ollama[1751]: time=2025-04-12T16:03:01.429-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212461056 required="6.2 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.141-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36733"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.143-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] build info | build=0 commit="unknown" tid="140123403542528" timestamp=1744498982
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140123403542528" timestamp=1744498982 total_threads=16
+Apr 12 16:03:02 launchpad ollama[93031]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36733" tid="140123403542528" timestamp=1744498982
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:03:02 launchpad ollama[1751]: time=2025-04-12T16:03:02.394-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:03:02 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:03:02 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:03:02 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:03:03 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:03:03 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:03:03 launchpad ollama[93031]: INFO [main] model loaded | tid="140123403542528" timestamp=1744498983
+Apr 12 16:03:03 launchpad ollama[1751]: time=2025-04-12T16:03:03.397-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:03:03 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:03 | 200 |  2.323360326s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:03:03 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:03 | 200 |  168.583748ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:03:03 launchpad ollama[1751]: time=2025-04-12T16:03:03.941-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212526592 required="6.5 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.629-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.630-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43341"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.630-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.631-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.631-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] build info | build=0 commit="unknown" tid="139945172643840" timestamp=1744498984
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139945172643840" timestamp=1744498984 total_threads=16
+Apr 12 16:03:04 launchpad ollama[93062]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43341" tid="139945172643840" timestamp=1744498984
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:03:04 launchpad ollama[1751]: time=2025-04-12T16:03:04.882-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:03:04 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:03:04 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:03:04 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:03:05 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:03:05 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:03:05 launchpad ollama[93062]: INFO [main] model loaded | tid="139945172643840" timestamp=1744498985
+Apr 12 16:03:05 launchpad ollama[1751]: time=2025-04-12T16:03:05.886-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 12 16:03:27 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:03:27 | 200 | 23.565740742s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 16:04:14 launchpad ollama[1751]: time=2025-04-12T16:04:14.949-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.5 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.646-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212788736 required="6.2 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.646-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.647-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39819"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.649-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] build info | build=0 commit="unknown" tid="140146931527680" timestamp=1744499055
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140146931527680" timestamp=1744499055 total_threads=16
+Apr 12 16:04:15 launchpad ollama[93352]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39819" tid="140146931527680" timestamp=1744499055
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type  f32:   65 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type q4_0:  225 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: llama_model_loader: - type q6_K:    1 tensors
+Apr 12 16:04:15 launchpad ollama[1751]: time=2025-04-12T16:04:15.900-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_0
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:04:15 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:04:15 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:04:15 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:04:16 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:04:16 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:04:16 launchpad ollama[93352]: INFO [main] model loaded | tid="140146931527680" timestamp=1744499056
+Apr 12 16:04:16 launchpad ollama[1751]: time=2025-04-12T16:04:16.903-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:04:17 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:17 | 200 |  2.305077355s |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:04:17 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:17 | 200 |  169.492759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 12 16:04:17 launchpad ollama[1751]: time=2025-04-12T16:04:17.444-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9212592128 required="6.5 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.145-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama702236440/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42925"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.146-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] build info | build=0 commit="unknown" tid="140604163182592" timestamp=1744499058
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140604163182592" timestamp=1744499058 total_threads=16
+Apr 12 16:04:18 launchpad ollama[93382]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42925" tid="140604163182592" timestamp=1744499058
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type  f32:   66 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type q4_K:  193 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: llama_model_loader: - type q6_K:   33 tensors
+Apr 12 16:04:18 launchpad ollama[1751]: time=2025-04-12T16:04:18.397-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_vocab: special tokens cache size = 256
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: arch             = llama
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: vocab type       = BPE
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_vocab          = 128256
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_merges         = 280147
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: vocab_only       = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd           = 4096
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_layer          = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_head           = 32
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_head_kv        = 8
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_rot            = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_swa            = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_gqa            = 4
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ff             = 14336
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_expert         = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_expert_used    = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: causal attn      = 1
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: pooling type     = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope type        = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope scaling     = linear
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: freq_scale_train = 1
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_d_state      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model type       = 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model params     = 8.03 B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_print_meta: max token length = 256
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 12 16:04:18 launchpad ollama[1751]: ggml_cuda_init: found 1 CUDA devices:
+Apr 12 16:04:18 launchpad ollama[1751]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 12 16:04:18 launchpad ollama[1751]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_ctx      = 8192
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_batch    = 512
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: n_ubatch   = 512
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: flash_attn = 0
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: freq_scale = 1
+Apr 12 16:04:19 launchpad ollama[1751]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: graph nodes  = 1030
+Apr 12 16:04:19 launchpad ollama[1751]: llama_new_context_with_model: graph splits = 2
+Apr 12 16:04:19 launchpad ollama[93382]: INFO [main] model loaded | tid="140604163182592" timestamp=1744499059
+Apr 12 16:04:19 launchpad ollama[1751]: time=2025-04-12T16:04:19.401-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 12 16:04:32 launchpad ollama[1751]: [GIN] 2025/04/12 - 16:04:32 | 200 | 15.566448776s |       127.0.0.1 | POST     "/api/chat"
+Apr 12 18:34:49 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 18:34:50 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 18:34:50 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 18:34:50 launchpad systemd[1]: ollama.service: Consumed 2min 25.493s CPU time, 10.2G memory peak, 9.2G read from disk, 508.1M written to disk.
+-- Boot b745703e6a8e420d9ac385fadf7e88b8 --
+Apr 12 18:35:30 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 18:35:30 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 18:35:31 launchpad ollama[1752]: 2025/04/12 18:35:31 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.028-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.036-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.038-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 18:35:31 launchpad ollama[1752]: time=2025-04-12T18:35:31.039-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama4229267965/runners
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.013-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.014-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 18:35:34 launchpad ollama[1752]: time=2025-04-12T18:35:34.248-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 12 19:07:02 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 12 19:07:02 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 12 19:07:02 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 12 19:07:02 launchpad systemd[1]: ollama.service: Consumed 3.434s CPU time, 786.5M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 2596e88fae2040f4923249aafb7e7712 --
+Apr 12 19:07:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 12 19:07:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 12 19:07:44 launchpad ollama[1757]: 2025/04/12 19:07:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.139-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.149-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.150-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 12 19:07:44 launchpad ollama[1757]: time=2025-04-12T19:07:44.152-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama260338899/runners
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.186-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.187-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.187-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 12 19:07:47 launchpad ollama[1757]: time=2025-04-12T19:07:47.402-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |    3.878869ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |     634.032µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 11:41:04 launchpad ollama[1757]: [GIN] 2025/04/13 - 11:41:04 | 200 |     589.933µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 11:52:35 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 11:52:35 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 11:52:35 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 11:52:35 launchpad systemd[1]: ollama.service: Consumed 3.511s CPU time, 787.4M memory peak, 234.7M read from disk, 508.1M written to disk.
+-- Boot 40317137f26d4efc9c418a400690a2a8 --
+Apr 13 11:53:16 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 11:53:16 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 11:53:16 launchpad ollama[1754]: 2025/04/13 11:53:16 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.436-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.445-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.446-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 11:53:16 launchpad ollama[1754]: time=2025-04-13T11:53:16.447-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1506917716/runners
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.475-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.476-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 11:53:19 launchpad ollama[1754]: time=2025-04-13T11:53:19.697-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |    3.728711ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |     620.216µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:15:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:15:00 | 200 |      581.55µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 12:18:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:18:38 | 200 |     576.644µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 12:18:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:18:38 | 200 |       26.46µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.978-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10194780160 required="6.2 GiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.978-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.2 GiB" free_swap="68.9 GiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.979-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39129"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.980-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 12:22:04 launchpad ollama[1754]: time=2025-04-13T12:22:04.981-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] build info | build=0 commit="unknown" tid="139944833957888" timestamp=1744572125
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944833957888" timestamp=1744572125 total_threads=16
+Apr 13 12:22:05 launchpad ollama[11080]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39129" tid="139944833957888" timestamp=1744572125
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 12:22:05 launchpad ollama[1754]: time=2025-04-13T12:22:05.231-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 12:22:05 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 12:22:05 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 12:22:05 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 12:22:10 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 12:22:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 12:22:11 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 12:22:11 launchpad ollama[11080]: INFO [main] model loaded | tid="139944833957888" timestamp=1744572131
+Apr 13 12:22:11 launchpad ollama[1754]: time=2025-04-13T12:22:11.250-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 13 12:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:11 | 200 |  6.620345601s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:11 | 200 |  172.120741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:12 | 200 |   166.67769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:12 | 200 |  168.137847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:14 | 200 |  2.579237733s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:17 | 200 |  2.499719065s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:18 | 200 |  735.220817ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:19 | 200 |  684.265221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:23 | 200 |  4.246424967s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:27 | 200 |    4.3554732s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:29 | 200 |  1.290874466s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:30 | 200 |  1.287759135s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:31 | 200 |  708.163748ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |  658.403673ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |   94.787215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:32 | 200 |   174.68155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:38 | 200 |  5.692742201s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:43 | 200 |  5.635757698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:49 | 200 |  5.697265074s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:22:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:22:56 | 200 |  7.068241853s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:00 | 200 |  3.788666964s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:06 | 200 |  5.622250292s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:12 | 200 |  5.676904881s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:18 | 200 |  5.740432975s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:25 | 200 |  7.038507573s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:28 | 200 |  3.788416995s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  135.694108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  133.483345ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  125.359869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:29 | 200 |  128.281804ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:30 | 200 |  133.415387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:37 | 200 |  7.421019595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:44 | 200 |  7.090316178s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:52 | 200 |  7.408047169s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:59 | 200 |  7.137196195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:23:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:23:59 | 200 |   87.397788ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:24:00 | 200 |   229.93963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:24:00 | 200 |  227.346569ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:04 | 200 |  1.002033433s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:05 | 200 |  899.358758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:06 | 200 |  808.103486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:07 | 200 |  800.237358ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:12 | 200 |  507.578689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:12 | 200 |  458.547326ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:18 | 200 |  180.118115ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:18 | 200 |  183.429445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:29 | 200 |  2.176445677s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:31 | 200 |  2.136366913s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:40 | 200 |   2.09451467s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:42 | 200 |  2.054908681s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |   461.03547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |  445.967572ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:43 | 200 |  123.176326ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  203.441727ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  261.427921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:26:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:26:44 | 200 |  256.136666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  198.409698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  192.951944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:00 | 200 |  273.013556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  255.900819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  262.591282ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:01 | 200 |  257.439468ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  300.355752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  290.196551ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  203.460911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:02 | 200 |  202.855722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  173.540775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  174.549117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:03 | 200 |  263.094569ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  259.544499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  169.070722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  169.492949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:04 | 200 |  264.753356ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:05 | 200 |  258.151333ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:38 | 200 |  4.578377588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:42 | 200 |  4.315993147s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:44 | 200 |  1.834749955s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:49 | 200 |  4.446968594s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:53 | 200 |  4.252742931s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:55 | 200 |  1.799616519s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |  433.214643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |   458.75978ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:56 | 200 |  176.232613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  213.143787ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  109.318915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:57 | 200 |  150.767635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:58 | 200 |  432.736027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:58 | 200 |  321.922819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:27:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:27:59 | 200 |  820.851375ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:00 | 200 |  840.302343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:00 | 200 |  382.753486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  396.209504ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  139.073883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:01 | 200 |  137.244626ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:02 | 200 |  817.011491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  753.764256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  130.102094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:03 | 200 |  210.247266ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:04 | 200 |  710.108091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  622.104944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  257.013709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  247.948671ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:05 | 200 |  220.030658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:06 | 200 |  217.357866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:20 | 200 |  301.292935ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:21 | 200 |  291.086893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:46 | 200 |  911.630883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:28:47 | 200 |  774.311509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:17 | 200 |  291.385876ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:18 | 200 |  289.243518ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:25 | 200 |  224.914551ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  231.370866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  230.858649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:26 | 200 |  223.030612ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:27 | 200 |   896.39329ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:28 | 200 |   798.67082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:30 | 200 |  1.581948538s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:31 | 200 |   1.60936706s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:37 | 200 |  5.032478053s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:38 | 200 |  1.463793265s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:43 | 200 |  5.225126273s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:44 | 200 |  1.716382735s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:44 | 200 |  252.559362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:45 | 200 |  1.566418451s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  356.504455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  363.738348ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  107.084681ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:46 | 200 |  147.478833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:58 | 200 |  222.089203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:29:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:29:58 | 200 |  244.987296ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |  507.131907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |  303.072736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:05 | 200 |   96.926176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:06 | 200 |  178.149413ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:09 | 200 |  183.990177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:10 | 200 |  191.941038ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:18 | 200 |  327.150573ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:18 | 200 |  332.403537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:19 | 200 |  420.449719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:19 | 200 |   326.71694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:20 | 200 |  623.232714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:20 | 200 |  565.881072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:21 | 200 |  423.253341ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:21 | 200 |   393.98183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:22 | 200 |  563.263425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  556.576111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  387.783433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:23 | 200 |  291.210528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:30 | 200 |  212.655474ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  212.689132ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  229.603477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:31 | 200 |  222.112378ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:46 | 200 |   4.02570548s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:30:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:30:50 | 200 |   4.08182804s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:15 | 200 |  188.784869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:15 | 200 |  184.990732ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:16 | 200 |  560.925517ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:16 | 200 |  397.926064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:26 | 200 |  319.963639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:27 | 200 |  319.499331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:31 | 200 |  370.174129ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:31 | 200 |  338.138337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:31:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:31:59 | 200 |  1.990467673s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:01 | 200 |  1.750887892s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:01 | 200 |   581.70396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 12:32:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 12:32:02 | 200 |  524.434263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.750-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9575792640 required="6.2 GiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.750-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.8 GiB" free_swap="68.9 GiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.751-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42997"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 13:23:30 launchpad ollama[1754]: time=2025-04-13T13:23:30.752-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] build info | build=0 commit="unknown" tid="140401670529024" timestamp=1744575810
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140401670529024" timestamp=1744575810 total_threads=16
+Apr 13 13:23:30 launchpad ollama[25157]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42997" tid="140401670529024" timestamp=1744575810
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 13:23:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 13:23:31 launchpad ollama[1754]: time=2025-04-13T13:23:31.003-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 13:23:31 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 13:23:31 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 13:23:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 13:23:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 13:23:31 launchpad ollama[25157]: INFO [main] model loaded | tid="140401670529024" timestamp=1744575811
+Apr 13 13:23:32 launchpad ollama[1754]: time=2025-04-13T13:23:32.007-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 13:23:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:35 | 200 |  5.415993131s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:40 | 200 |  4.062804663s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:45 | 200 |  5.196035713s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:50 | 200 |  4.921320163s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:23:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:23:55 | 200 |  4.698829369s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:00 | 200 |  5.496283443s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:05 | 200 |  4.915490411s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:10 | 200 |  4.922804777s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:16 | 200 |   5.37305929s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:21 | 200 |  4.922397261s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:26 | 200 |  4.797622009s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:31 | 200 |  5.552054579s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:36 | 200 |  4.902656616s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:41 | 200 |  4.866707608s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:48 | 200 |  5.545103722s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:53 | 200 |  5.094984698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:24:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:24:58 | 200 |  4.939207909s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:04 | 200 |  5.649536123s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:09 | 200 |   4.71763893s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:14 | 200 |  5.182880558s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:20 | 200 |  5.840619663s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:26 | 200 |  5.745745863s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:30 | 200 |  4.924080051s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:36 | 200 |  5.067641379s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:40 | 200 |  4.325283141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:45 | 200 |  4.519983799s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:49 | 200 |  4.658572694s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:25:54 | 200 |   5.17862032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:00 | 200 |  5.570241801s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:02 | 200 |     593.805µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 13:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:02 | 200 |      23.853µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 13:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:06 | 200 |  5.471502071s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:11 | 200 |  5.872309744s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:17 | 200 |  5.286808418s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:23 | 200 |  6.135609698s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:27 | 200 |  4.372406392s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:33 | 200 |  5.131204286s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:37 | 200 |  4.268732371s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:41 | 200 |  4.477129755s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:46 | 200 |  4.496509701s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:50 | 200 |  4.488880857s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:26:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:26:55 | 200 |  5.020606129s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:01 | 200 |  5.325673895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:06 | 200 |  4.815120267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:10 | 200 |  4.495471187s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:16 | 200 |  5.461260633s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:21 | 200 |  5.702900588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:28 | 200 |  6.186560401s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:33 | 200 |   5.63639189s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:39 | 200 |  5.631363555s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:45 | 200 |  6.003421144s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:51 | 200 |  5.524773823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:27:56 | 200 |  4.964827112s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:02 | 200 |  6.005929633s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:07 | 200 |   5.72471856s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:13 | 200 |  5.769708419s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:19 | 200 |  6.137424034s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:26 | 200 |  6.133989594s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:32 | 200 |  6.510034616s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:38 | 200 |  5.754626249s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:43 | 200 |  5.260813264s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:49 | 200 |  5.682071603s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:28:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:28:55 | 200 |  5.567776363s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:00 | 200 |  5.158658795s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:06 | 200 |  5.638567621s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:12 | 200 |  6.349813088s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:17 | 200 |  4.757526294s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:22 | 200 |  5.546029631s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:28 | 200 |  5.197849775s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:33 | 200 |  5.500846095s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:39 | 200 |  5.654043011s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:45 | 200 |  5.721036589s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:50 | 200 |  5.064005236s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:29:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:29:55 | 200 |  5.816013542s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:01 | 200 |  5.668873855s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:07 | 200 |  5.660936918s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:13 | 200 |  5.837027525s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:19 | 200 |  5.895537466s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:25 | 200 |  6.482066041s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:31 | 200 |  5.836212534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:37 | 200 |   5.76839687s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:43 | 200 |  6.209691243s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:50 | 200 |  6.437179005s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:30:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:30:56 | 200 |  6.246523974s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:02 | 200 |  5.639837003s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:06 | 200 |  4.798913797s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:12 | 200 |  5.975174644s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:18 | 200 |  5.606964754s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:24 | 200 |  5.813943653s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:30 | 200 |  6.250468553s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:35 | 200 |  5.233499699s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:42 | 200 |  5.975729359s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:48 | 200 |  6.100053273s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:54 | 200 |  6.231265195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:31:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:31:59 | 200 |  5.522625483s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:06 | 200 |   6.05436449s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:12 | 200 |  6.272603497s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:18 | 200 |  5.881001648s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:24 | 200 |  6.275035141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:30 | 200 |  5.452967453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:36 | 200 |  5.942843035s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:41 | 200 |  5.677462949s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:47 | 200 |  5.878414618s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:53 | 200 |  5.845273746s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:32:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:32:59 | 200 |  5.909972271s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:05 | 200 |  6.136041957s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:12 | 200 |  6.279992125s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:18 | 200 |  6.044054175s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:24 | 200 |  5.920208613s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:30 | 200 |  5.989413926s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:36 | 200 |  6.188999141s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:41 | 200 |  5.115428148s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:46 | 200 |  5.405639943s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:52 | 200 |  5.584974571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:33:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:33:58 | 200 |  5.833793973s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:03 | 200 |  5.491157713s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:10 | 200 |  5.987821542s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:16 | 200 |  6.057862445s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:21 | 200 |  5.777213823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:27 | 200 |  5.795979087s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:33 | 200 |  6.190637203s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:39 | 200 |  5.786517686s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:46 | 200 |  6.210268898s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:52 | 200 |  5.931000203s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:34:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:34:57 | 200 |  5.500044322s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:03 | 200 |  5.621318681s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:08 | 200 |  5.697127457s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:14 | 200 |  5.808690627s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:20 | 200 |  5.555098883s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:26 | 200 |   6.01157704s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:32 | 200 |  5.501388107s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:37 | 200 |   5.62617979s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:43 | 200 |  5.987086515s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:49 | 200 |  5.732456176s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:35:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:35:54 | 200 |  5.432410098s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:00 | 200 |  5.077717101s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:05 | 200 |  4.982458955s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:11 | 200 |    6.1086728s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:17 | 200 |  6.086346691s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:23 | 200 |  5.730260276s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:28 | 200 |  5.582906284s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:34 | 200 |  5.733756533s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:39 | 200 |  5.165732539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:45 | 200 |  5.479445315s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:51 | 200 |  5.779007014s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:36:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:36:56 | 200 |  5.477441837s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:00 | 200 |  4.349689258s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:06 | 200 |  5.463534221s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:12 | 200 |  5.708383993s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:17 | 200 |  5.335859063s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:23 | 200 |  5.971078057s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:29 | 200 |  5.604126405s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:34 | 200 |  5.648820459s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:40 | 200 |  5.255048549s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:45 | 200 |  5.707188463s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:51 | 200 |  5.477529583s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:37:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:37:57 | 200 |  6.053374263s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:03 | 200 |  5.513459723s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:08 | 200 |  5.493355872s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:14 | 200 |  5.347172054s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:19 | 200 |  5.752443182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:25 | 200 |  5.975179839s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:31 | 200 |  6.039535814s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:37 | 200 |  5.605949784s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:43 | 200 |  6.289499502s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:49 | 200 |   5.83331787s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:38:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:38:55 | 200 |   5.85643737s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:01 | 200 |  5.822546245s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:07 | 200 |  6.249896892s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:13 | 200 |    5.6911034s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:19 | 200 |    6.0428219s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:25 | 200 |   5.57264006s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:30 | 200 |  4.988640788s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:34 | 200 |   4.64227828s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:39 | 200 |   4.41007889s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:44 | 200 |  4.606295726s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:48 | 200 |  4.635780426s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:53 | 200 |  4.686856472s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:39:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:39:58 | 200 |  4.976407078s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:03 | 200 |  5.001600696s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:09 | 200 |  5.486809128s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:13 | 200 |  4.805197748s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:19 | 200 |  5.640329132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:24 | 200 |  5.153029188s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:30 | 200 |  5.433601033s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:35 | 200 |  5.133185895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:40 | 200 |  5.502278335s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:46 | 200 |  5.491523638s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:51 | 200 |  4.931678907s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:40:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:40:56 | 200 |  5.266544924s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:02 | 200 |  5.933332881s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:08 | 200 |  5.495537267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:13 | 200 |  4.836678527s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:18 | 200 |  4.989040022s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:23 | 200 |  5.329049969s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:28 | 200 |  5.234773845s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:33 | 200 |  4.962046216s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:39 | 200 |  5.344885645s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:44 | 200 |  5.375789875s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:49 | 200 |  4.958244411s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:54 | 200 |  5.102096939s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:41:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:41:55 | 200 |  431.821525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:08 | 200 |  5.749759226s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:13 | 200 |  5.065595582s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:19 | 200 |  5.013637743s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:24 | 200 |  5.711333129s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:29 | 200 |  4.860127326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:34 | 200 |  5.122585296s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:40 | 200 |  5.861769567s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:46 | 200 |  5.829885673s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:51 | 200 |  5.035542284s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:42:56 | 200 |  5.155020132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:01 | 200 |   4.44098361s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:05 | 200 |  4.589102545s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:10 | 200 |  4.652115898s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:15 | 200 |  5.171521688s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:21 | 200 |  5.378501947s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:26 | 200 |  5.372232475s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:32 | 200 |    5.8945508s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:38 | 200 |  5.479668875s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:44 | 200 |  6.097543657s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:48 | 200 |  4.290027126s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:53 | 200 |  5.181262161s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:43:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:43:58 | 200 |  4.265586185s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:02 | 200 |  4.473588432s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:07 | 200 |  4.453949267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:11 | 200 |  4.559658765s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:16 | 200 |   5.06389266s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:22 | 200 |  5.447513438s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:27 | 200 |  4.776504442s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:31 | 200 |  4.566599545s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:37 | 200 |  5.546128959s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:43 | 200 |  5.752772539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:49 | 200 |  6.234532801s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:44:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:44:55 | 200 |  5.605820534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:00 | 200 |  5.677275317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:06 | 200 |   6.05924743s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:12 | 200 |  5.577431004s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:17 | 200 |  4.968856659s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:23 | 200 |  6.106475063s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:29 | 200 |  5.846626533s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:35 | 200 |  5.776827368s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:41 | 200 |  6.188789661s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:47 | 200 |  6.182290689s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:45:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:45:54 | 200 |   6.60229112s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:00 | 200 |  5.877782518s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:05 | 200 |  5.338586691s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:11 | 200 |  5.789687501s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:17 | 200 |  5.606892329s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:22 | 200 |  5.192657539s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:28 | 200 |   5.75842992s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:34 | 200 |  6.487501692s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:39 | 200 |  5.131103037s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:45 | 200 |  5.845605572s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:51 | 200 |  5.599578609s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:46:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:46:57 | 200 |   5.97033032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:03 | 200 |  6.003435148s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:09 | 200 |  5.957370711s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:14 | 200 |  5.334350662s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:20 | 200 |  5.814787472s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:26 | 200 |  5.762704932s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:32 | 200 |  5.717739312s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:38 | 200 |  5.819520894s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:44 | 200 |  5.812038753s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:50 | 200 |  6.384932715s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:47:56 | 200 |     5.746921s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:02 | 200 |  5.892990121s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:08 | 200 |  6.084744143s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:14 | 200 |  6.498642586s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:21 | 200 |   6.40168326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:26 | 200 |  5.655228905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:31 | 200 |  4.894272095s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:37 | 200 |  5.943296721s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:43 | 200 |  5.655136067s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:49 | 200 |  5.866005009s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:48:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:48:55 | 200 |  6.253577932s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:01 | 200 |  5.270707366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:07 | 200 |     6.037823s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:13 | 200 |  6.254477182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:19 | 200 |  6.380542311s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:25 | 200 |  5.623668814s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:31 | 200 |   6.12393075s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:37 | 200 |  6.187287676s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:43 | 200 |  5.836886167s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:50 | 200 |  6.250676853s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:49:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:49:55 | 200 |  5.485741682s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:01 | 200 |  6.026935536s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:07 | 200 |  5.749063147s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:13 | 200 |  5.955839625s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:19 | 200 |   5.91419045s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:25 | 200 |  5.918028006s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:31 | 200 |  6.101490779s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:37 | 200 |  6.276725098s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:44 | 200 |  6.131594161s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:49 | 200 |  5.796885366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:50:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:50:56 | 200 |  6.096006088s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:02 | 200 |   6.09558453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:07 | 200 |  5.447946805s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:13 | 200 |  5.713022478s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:19 | 200 |  5.843924371s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:25 | 200 |  6.095347033s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:31 | 200 |  5.740966368s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:37 | 200 |  6.347683448s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:44 | 200 |  6.435408876s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:50 | 200 |  5.964024895s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:51:56 | 200 |  6.224970635s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:03 | 200 |  6.686047249s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:09 | 200 |  6.294346113s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:15 | 200 |  6.307390503s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:21 | 200 |  6.118937514s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:27 | 200 |   5.83837977s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:33 | 200 |   5.83770728s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:39 | 200 |  5.526917064s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:45 | 200 |   5.84416987s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:51 | 200 |  5.873270608s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:52:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:52:57 | 200 |  6.339226388s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:03 | 200 |  5.746402708s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:09 | 200 |  5.983170946s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:15 | 200 |  6.066471507s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:21 | 200 |  5.797107428s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:26 | 200 |  5.668402952s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:32 | 200 |  5.310502643s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:37 | 200 |  5.023337994s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:43 | 200 |   6.11730093s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:49 | 200 |  5.963660689s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:53:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:53:55 | 200 |  5.969444444s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:01 | 200 |  6.001424453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:07 | 200 |  5.927428618s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:13 | 200 |  5.540336534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:18 | 200 |  5.597380048s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:24 | 200 |  6.127936571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:30 | 200 |  5.806394948s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:35 | 200 |  4.594824523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:41 | 200 |  5.833699288s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:46 | 200 |  5.638136462s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:52 | 200 |  5.699265974s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:54:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:54:58 | 200 |   6.06026667s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:04 | 200 |  5.917653905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:10 | 200 |  5.696973403s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:16 | 200 |  5.567990639s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:21 | 200 |  5.856747188s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:27 | 200 |  5.698476772s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:33 | 200 |  5.964897303s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:39 | 200 |  5.460831696s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:44 | 200 |  5.594724702s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:50 | 200 |  5.402888637s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:55:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:55:55 | 200 |  5.688187971s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:01 | 200 |    5.9337402s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:07 | 200 |  6.021417447s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:13 | 200 |   5.71824651s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:20 | 200 |  6.334822935s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:26 | 200 |  5.925755234s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:32 | 200 |  5.976887504s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:37 | 200 |  5.693066817s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:44 | 200 |  6.240373588s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:49 | 200 |  5.583000507s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:56:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:56:55 | 200 |  6.102539493s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:01 | 200 |  5.672866759s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:06 | 200 |  5.033931269s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:11 | 200 |  4.531799214s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:15 | 200 |   4.41780876s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:20 | 200 |  4.686500532s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:25 | 200 |  4.603118758s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:29 | 200 |  4.647055381s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:34 | 200 |  4.963434784s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:39 | 200 |   4.96611803s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:45 | 200 |  5.447071036s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:50 | 200 |  4.752277675s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:57:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:57:55 | 200 |  5.605813212s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:00 | 200 |  5.100829576s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:06 | 200 |  5.435428538s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:11 | 200 |   5.16640024s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:17 | 200 |  5.601487879s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:22 | 200 |  5.625300132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:27 | 200 |     4.949263s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:33 | 200 |   5.26491666s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:38 | 200 |   5.84911171s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:44 | 200 |  5.393961401s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:49 | 200 |   4.77867586s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:54 | 200 |  4.928236182s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:58:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:58:59 | 200 |  5.329308572s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:04 | 200 |  5.344585045s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:09 | 200 |  4.906445709s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:15 | 200 |  5.301343198s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:20 | 200 |  5.464103927s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:25 | 200 |  5.037028726s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:30 | 200 |  5.063213861s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:31 | 200 |  436.823719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:39 | 200 |  444.414215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:39 | 200 |  333.741721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:46 | 200 |  6.584590072s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:47 | 200 |  1.076481124s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:54 | 200 |  6.531209513s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |   997.65372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |  101.669931ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:55 | 200 |  184.889674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:56 | 200 |  351.240253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:56 | 200 |  332.652948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 13:59:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 13:59:59 | 200 |  2.959259391s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:02 | 200 |  2.904589998s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:05 | 200 |  2.978848365s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:08 | 200 |  3.092754275s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:14 | 200 |  5.053710279s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:19 | 200 |  5.046706195s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:24 | 200 |  5.370630804s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:26 | 200 |  1.264594055s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:31 | 200 |   5.18030291s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:36 | 200 |  5.071941887s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:41 | 200 |  5.249375247s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  1.227306032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  130.518129ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:00:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:00:43 | 200 |  129.362381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:04:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:04:55 | 200 |     556.304µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 14:05:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:05:37 | 200 |      27.382µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9613017088 required="6.2 GiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.6 GiB" free_swap="68.9 GiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.646-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45703"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.648-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] build info | build=0 commit="unknown" tid="140346057936896" timestamp=1744578393
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140346057936896" timestamp=1744578393 total_threads=16
+Apr 13 14:06:33 launchpad ollama[37332]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45703" tid="140346057936896" timestamp=1744578393
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:06:33 launchpad ollama[1754]: time=2025-04-13T14:06:33.899-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:06:33 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:06:33 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:06:33 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:06:34 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:06:34 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:06:34 launchpad ollama[37332]: INFO [main] model loaded | tid="140346057936896" timestamp=1744578394
+Apr 13 14:06:34 launchpad ollama[1754]: time=2025-04-13T14:06:34.902-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:06:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:35 | 200 |  1.603741754s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:40 | 200 |  168.741295ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:41 | 200 |  922.237471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:06:41 launchpad ollama[1754]: time=2025-04-13T14:06:41.327-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:41 launchpad ollama[1754]: time=2025-04-13T14:06:41.468-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.150-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.151-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34297"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.152-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] build info | build=0 commit="unknown" tid="140574654976000" timestamp=1744578402
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140574654976000" timestamp=1744578402 total_threads=16
+Apr 13 14:06:42 launchpad ollama[37378]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34297" tid="140574654976000" timestamp=1744578402
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:06:42 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:06:42 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:06:42 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:06:42 launchpad ollama[1754]: time=2025-04-13T14:06:42.403-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:06:49 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:06:50 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:06:50 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:06:50 launchpad ollama[37378]: INFO [main] model loaded | tid="140574654976000" timestamp=1744578410
+Apr 13 14:06:50 launchpad ollama[1754]: time=2025-04-13T14:06:50.676-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Apr 13 14:06:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:56 | 200 | 15.633349406s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:06:57 launchpad ollama[1754]: time=2025-04-13T14:06:57.018-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:57 | 200 |  879.132204ms |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:06:57 launchpad ollama[1754]: time=2025-04-13T14:06:57.931-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:06:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:06:59 | 200 |  1.297501798s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:07:27 launchpad ollama[1754]: time=2025-04-13T14:07:27.939-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="696.9 MiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.662-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9739173888 required="6.2 GiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.663-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.663-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36439"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.664-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] build info | build=0 commit="unknown" tid="139924029399040" timestamp=1744578448
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139924029399040" timestamp=1744578448 total_threads=16
+Apr 13 14:07:28 launchpad ollama[39832]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36439" tid="139924029399040" timestamp=1744578448
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:07:28 launchpad ollama[1754]: time=2025-04-13T14:07:28.915-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:07:28 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:07:28 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:07:28 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:07:29 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:07:29 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:07:29 launchpad ollama[39832]: INFO [main] model loaded | tid="139924029399040" timestamp=1744578449
+Apr 13 14:07:29 launchpad ollama[1754]: time=2025-04-13T14:07:29.918-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:07:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:30 | 200 |  2.330926959s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:30 | 200 |  166.981297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:31 | 200 |  1.139997584s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:07:31 launchpad ollama[1754]: time=2025-04-13T14:07:31.441-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:07:31 launchpad ollama[1754]: time=2025-04-13T14:07:31.586-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.267-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.267-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.268-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 40181"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.269-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] build info | build=0 commit="unknown" tid="139770075373568" timestamp=1744578452
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139770075373568" timestamp=1744578452 total_threads=16
+Apr 13 14:07:32 launchpad ollama[39868]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40181" tid="139770075373568" timestamp=1744578452
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:07:32 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:07:32 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:07:32 launchpad ollama[1754]: time=2025-04-13T14:07:32.564-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:07:32 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:07:33 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:07:33 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:07:33 launchpad ollama[39868]: INFO [main] model loaded | tid="139770075373568" timestamp=1744578453
+Apr 13 14:07:33 launchpad ollama[1754]: time=2025-04-13T14:07:33.568-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:07:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:07:37 | 200 |   6.08534954s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.167-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="628.2 MiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.877-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9679863808 required="6.2 GiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.877-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.878-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41487"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:08:12 launchpad ollama[1754]: time=2025-04-13T14:08:12.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] build info | build=0 commit="unknown" tid="140242398044160" timestamp=1744578492
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140242398044160" timestamp=1744578492 total_threads=16
+Apr 13 14:08:12 launchpad ollama[40988]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41487" tid="140242398044160" timestamp=1744578492
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:08:12 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:08:13 launchpad ollama[1754]: time=2025-04-13T14:08:13.131-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:08:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:08:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:08:13 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:08:13 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:08:13 launchpad ollama[40988]: INFO [main] model loaded | tid="140242398044160" timestamp=1744578493
+Apr 13 14:08:14 launchpad ollama[1754]: time=2025-04-13T14:08:14.135-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:08:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:16 | 200 |  4.806453389s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:31 | 200 |  175.757221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:32 | 200 |  172.517442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |  1.288758055s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |   88.791494ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:33 | 200 |  170.831309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:34 | 200 |   952.11716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:08:34 launchpad ollama[1754]: time=2025-04-13T14:08:34.760-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:08:34 launchpad ollama[1754]: time=2025-04-13T14:08:34.911-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.602-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.602-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 35291"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.603-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] build info | build=0 commit="unknown" tid="139928022212608" timestamp=1744578515
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139928022212608" timestamp=1744578515 total_threads=16
+Apr 13 14:08:35 launchpad ollama[41137]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35291" tid="139928022212608" timestamp=1744578515
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:08:35 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:08:35 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:08:35 launchpad ollama[1754]: time=2025-04-13T14:08:35.904-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:08:35 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:08:36 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:08:36 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:08:36 launchpad ollama[41137]: INFO [main] model loaded | tid="139928022212608" timestamp=1744578516
+Apr 13 14:08:36 launchpad ollama[1754]: time=2025-04-13T14:08:36.908-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:08:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:08:45 | 200 | 10.705663426s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:10:07 launchpad ollama[1754]: time=2025-04-13T14:10:07.659-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="538.5 MiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9636937728 required="6.2 GiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.384-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.385-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38155"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.386-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] build info | build=0 commit="unknown" tid="140660544999424" timestamp=1744578608
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140660544999424" timestamp=1744578608 total_threads=16
+Apr 13 14:10:08 launchpad ollama[43401]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38155" tid="140660544999424" timestamp=1744578608
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:10:08 launchpad ollama[1754]: time=2025-04-13T14:10:08.637-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:10:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:10:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:10:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:10:09 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:10:09 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:10:09 launchpad ollama[43401]: INFO [main] model loaded | tid="140660544999424" timestamp=1744578609
+Apr 13 14:10:09 launchpad ollama[1754]: time=2025-04-13T14:10:09.641-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:10:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:10 | 200 |  2.942440113s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:11 | 200 |  4.015087509s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:13 | 200 |  5.971331023s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:27 | 200 |  785.267226ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  533.626034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |   525.04085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  579.684908ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:47 | 200 |  933.626698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |  1.627546595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |   1.05933701s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:48 | 200 |  1.444562882s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:49 | 200 |  1.080242134s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:49 | 200 |  2.005128209s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |    1.5564227s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |  1.631973032s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:50 | 200 |  1.428060482s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:53 | 200 |  6.875375788s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:10:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:10:58 | 200 |  4.312372391s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:00 | 200 |  1.872216451s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:30 | 200 |  222.520879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:11:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:11:30 | 200 |  307.261977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:02 | 200 |  178.863566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:02 | 200 |  168.590763ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:03 | 200 |  1.229131089s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:04 | 200 |   84.885653ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:04 | 200 |  165.331666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |  1.205902473s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |   84.907812ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:05 | 200 |  165.622158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |   810.74577ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |   85.652738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:06 | 200 |  165.694432ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:07 | 200 |  1.189130444s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |   84.733814ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |  165.722535ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:08 | 200 |  739.458729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |    86.31391ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |  167.013607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |  614.948209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:09 | 200 |   83.654423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  168.015712ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  464.896495ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  167.517838ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:10 | 200 |  164.307641ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  292.698229ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  167.910215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  167.252018ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  201.751736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:11 | 200 |  166.702799ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  165.305206ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  457.359722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |   86.846549ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:12 | 200 |  170.970373ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |  611.425013ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |   85.038862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:13 | 200 |  170.081852ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  448.816767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |     166.767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  165.092509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  179.544664ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:14 | 200 |  166.649812ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:15 | 200 |  165.778387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:15 | 200 |  877.670041ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |   84.912889ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  165.850521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  207.647324ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  166.458928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:16 | 200 |  166.767253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |  686.543694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |  166.765162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:17 | 200 |   167.16146ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  293.760466ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  166.895627ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  166.969389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |  257.445145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:18 | 200 |   166.68461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  165.672921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  209.119235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  167.356696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  166.746511ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:19 | 200 |  227.172747ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:20 | 200 |  124.692001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:20 | 200 |  166.931886ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:21 | 200 |  653.656061ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.061-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.205-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.871-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.871-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 33735"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:12:21 launchpad ollama[1754]: time=2025-04-13T14:12:21.872-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] build info | build=0 commit="unknown" tid="140155783323648" timestamp=1744578741
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140155783323648" timestamp=1744578741 total_threads=16
+Apr 13 14:12:21 launchpad ollama[44133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33735" tid="140155783323648" timestamp=1744578741
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:12:21 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:12:21 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:12:21 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: time=2025-04-13T14:12:22.170-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:12:22 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:12:22 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:12:22 launchpad ollama[44133]: INFO [main] model loaded | tid="140155783323648" timestamp=1744578742
+Apr 13 14:12:23 launchpad ollama[1754]: time=2025-04-13T14:12:23.174-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:12:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:12:35 | 200 | 14.697469857s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:13:10 launchpad ollama[1754]: time=2025-04-13T14:13:10.721-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="626.2 MiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.416-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9660923904 required="6.2 GiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.416-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.417-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43871"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.418-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] build info | build=0 commit="unknown" tid="139872123678720" timestamp=1744578791
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139872123678720" timestamp=1744578791 total_threads=16
+Apr 13 14:13:11 launchpad ollama[48447]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43871" tid="139872123678720" timestamp=1744578791
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:13:11 launchpad ollama[1754]: time=2025-04-13T14:13:11.669-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:13:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:13:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:13:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:13:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:13:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:13:12 launchpad ollama[48447]: INFO [main] model loaded | tid="139872123678720" timestamp=1744578792
+Apr 13 14:13:12 launchpad ollama[1754]: time=2025-04-13T14:13:12.672-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:13:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:12 | 200 |  2.302713961s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:12 | 200 |  158.626226ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:13 | 200 |  937.588661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:14 | 200 |   77.584479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:14 | 200 |  158.260904ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |  934.774772ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |   77.100094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:15 | 200 |  157.697444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |  779.977236ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |   76.910703ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:16 | 200 |  158.041031ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |  1.088751922s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |   76.992547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:17 | 200 |  158.428366ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |  662.584613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |    77.92364ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:18 | 200 |  159.108835ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:19 | 200 |  1.096088798s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:19 | 200 |  116.956081ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  159.109475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  471.290726ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  117.607633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:20 | 200 |  157.309767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  294.918527ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  156.769376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  158.300633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  202.921287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:21 | 200 |  158.063971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  158.939678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  499.578141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |   76.557242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:22 | 200 |  159.578275ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |   966.20284ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |   76.472993ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:23 | 200 |  156.247542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   496.94847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   119.74891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |  157.811469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:24 | 200 |   177.39125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:25 | 200 |  157.881945ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:25 | 200 |  161.005477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  890.745596ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |   76.973992ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |   158.59959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  208.329914ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  158.113737ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:26 | 200 |  159.837674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  694.464519ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  118.615055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:27 | 200 |  157.215416ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |  293.984475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |   157.15125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |   157.90609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:28 | 200 |  263.528027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  158.802444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  157.186862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  208.829409ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |   158.30828ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  156.715398ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:29 | 200 |  230.375061ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:30 | 200 |  119.461532ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:30 | 200 |  158.515154ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:31 | 200 |  1.170909187s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:13:31 launchpad ollama[1754]: time=2025-04-13T14:13:31.644-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:13:31 launchpad ollama[1754]: time=2025-04-13T14:13:31.791-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.486-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.487-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37457"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.488-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] build info | build=0 commit="unknown" tid="139833438691328" timestamp=1744578812
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139833438691328" timestamp=1744578812 total_threads=16
+Apr 13 14:13:32 launchpad ollama[48554]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37457" tid="139833438691328" timestamp=1744578812
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:13:32 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:13:32 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:13:32 launchpad ollama[1754]: time=2025-04-13T14:13:32.788-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:13:32 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:13:33 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:13:33 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:13:33 launchpad ollama[48554]: INFO [main] model loaded | tid="139833438691328" timestamp=1744578813
+Apr 13 14:13:33 launchpad ollama[1754]: time=2025-04-13T14:13:33.791-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:13:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:13:37 | 200 |  6.297320741s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:14:02 launchpad ollama[1754]: time=2025-04-13T14:14:02.705-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="674.2 MiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.405-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9715449856 required="6.2 GiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.405-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.406-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44305"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.407-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] build info | build=0 commit="unknown" tid="140377334923264" timestamp=1744578843
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140377334923264" timestamp=1744578843 total_threads=16
+Apr 13 14:14:03 launchpad ollama[49647]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44305" tid="140377334923264" timestamp=1744578843
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:14:03 launchpad ollama[1754]: time=2025-04-13T14:14:03.658-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:14:03 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:14:03 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:14:03 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:14:04 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:14:04 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:14:04 launchpad ollama[49647]: INFO [main] model loaded | tid="140377334923264" timestamp=1744578844
+Apr 13 14:14:04 launchpad ollama[1754]: time=2025-04-13T14:14:04.661-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:14:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:04 | 200 |  2.306030695s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:05 | 200 |  166.656088ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |  1.138165787s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |   87.422005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:06 | 200 |  167.784936ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  1.183649612s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  129.367308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:07 | 200 |  167.598403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |  773.757227ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |   86.455469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:08 | 200 |  170.154309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |    1.1932753s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |   85.359108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:10 | 200 |  169.954418ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |  700.841168ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |   86.867979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:11 | 200 |  167.969871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |  824.591738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |   84.802423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:12 | 200 |  166.892698ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  466.128794ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  126.287256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |   168.85059ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  294.322349ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  168.301309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:13 | 200 |  167.423887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  200.010107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  164.964417ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:14 | 200 |  168.155695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  500.390545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |   85.720084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  167.746751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:15 | 200 |  656.502986ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |    86.17137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  167.822759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  452.497976ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:16 | 200 |  167.551093ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  166.131167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  178.737683ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |   169.00327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:17 | 200 |  165.137246ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |  884.661976ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |    86.11287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |    169.0752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:18 | 200 |  207.674445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  167.969393ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  167.365528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:19 | 200 |  690.611287ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  126.045026ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  168.416199ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  295.265574ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  166.335214ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:20 | 200 |  166.803242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  259.505072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  167.906408ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  166.511764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  206.139104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:21 | 200 |  167.362357ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  166.370259ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  227.774431ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  125.990828ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:22 | 200 |  168.339496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:24 | 200 |  1.076270848s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.068-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.212-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.877-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.877-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.878-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 36923"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:14:24 launchpad ollama[1754]: time=2025-04-13T14:14:24.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] build info | build=0 commit="unknown" tid="139944885350400" timestamp=1744578864
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139944885350400" timestamp=1744578864 total_threads=16
+Apr 13 14:14:24 launchpad ollama[49752]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36923" tid="139944885350400" timestamp=1744578864
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:14:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:14:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:14:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: time=2025-04-13T14:14:25.175-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:14:25 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:14:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:14:26 launchpad ollama[49752]: INFO [main] model loaded | tid="139944885350400" timestamp=1744578866
+Apr 13 14:14:26 launchpad ollama[1754]: time=2025-04-13T14:14:26.178-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:14:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:14:43 | 200 | 18.938773782s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:15:23 launchpad ollama[1754]: time=2025-04-13T14:15:23.831-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="685.9 MiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9727639552 required="6.2 GiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.537-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.538-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37539"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.538-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.539-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.539-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] build info | build=0 commit="unknown" tid="140495880671232" timestamp=1744578924
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140495880671232" timestamp=1744578924 total_threads=16
+Apr 13 14:15:24 launchpad ollama[55078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37539" tid="140495880671232" timestamp=1744578924
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:15:24 launchpad ollama[1754]: time=2025-04-13T14:15:24.789-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:15:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:15:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:15:24 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:15:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:15:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:15:25 launchpad ollama[55078]: INFO [main] model loaded | tid="140495880671232" timestamp=1744578925
+Apr 13 14:15:25 launchpad ollama[1754]: time=2025-04-13T14:15:25.793-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:15:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:25 | 200 |  2.314727864s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:26 | 200 |  171.157155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |  1.210687591s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |    89.89124ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:27 | 200 |  128.727309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:28 | 200 |  1.286389493s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:28 | 200 |   92.404352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:29 | 200 |  172.220329ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:29 | 200 |  803.624256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:30 | 200 |   88.948481ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:30 | 200 |  168.204584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |  1.223560146s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |   90.375008ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:31 | 200 |   172.41442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |   751.93462ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |   89.094999ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:32 | 200 |  170.968699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |  841.495071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |   91.642477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:33 | 200 |  169.916497ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  470.730912ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  129.556752ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:34 | 200 |  172.025034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  296.047595ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  169.916791ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |   170.46005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |   202.58232ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  173.493465ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:35 | 200 |  172.141211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |  512.778462ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |   89.326113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:36 | 200 |  169.889956ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |  660.200756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |   88.097633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:37 | 200 |  171.044793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  473.604906ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  170.801219ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  170.458333ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  178.146144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:38 | 200 |  168.480833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |  172.921608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |  678.462695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:39 | 200 |    88.61989ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  171.022436ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  208.262025ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |  170.280019ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:40 | 200 |   170.34308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |   699.01613ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  129.198849ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  170.989689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:41 | 200 |  296.178441ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  170.188273ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  171.127482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  263.126762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  168.281645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:42 | 200 |  170.153962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  210.320573ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  169.909425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  172.835727ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |   228.19585ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:43 | 200 |  129.579855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:44 | 200 |  171.002463ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:15:45 | 200 |  1.143145453s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:15:45 launchpad ollama[1754]: time=2025-04-13T14:15:45.336-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:15:45 launchpad ollama[1754]: time=2025-04-13T14:15:45.486-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.188-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 46877"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.190-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] build info | build=0 commit="unknown" tid="140188558594048" timestamp=1744578946
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140188558594048" timestamp=1744578946 total_threads=16
+Apr 13 14:15:46 launchpad ollama[55222]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46877" tid="140188558594048" timestamp=1744578946
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:15:46 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:15:46 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:15:46 launchpad ollama[1754]: time=2025-04-13T14:15:46.492-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:15:46 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:15:47 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:15:47 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:15:47 launchpad ollama[55222]: INFO [main] model loaded | tid="140188558594048" timestamp=1744578947
+Apr 13 14:15:47 launchpad ollama[1754]: time=2025-04-13T14:15:47.496-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:16:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:16:03 | 200 | 18.466346928s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:17:17 launchpad ollama[1754]: time=2025-04-13T14:17:17.360-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="658.0 MiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9719382016 required="6.2 GiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.061-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.062-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41757"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] build info | build=0 commit="unknown" tid="140619095724032" timestamp=1744579038
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140619095724032" timestamp=1744579038 total_threads=16
+Apr 13 14:17:18 launchpad ollama[60114]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41757" tid="140619095724032" timestamp=1744579038
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:17:18 launchpad ollama[1754]: time=2025-04-13T14:17:18.314-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:17:18 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:17:18 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:17:18 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:17:19 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:17:19 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:17:19 launchpad ollama[60114]: INFO [main] model loaded | tid="140619095724032" timestamp=1744579039
+Apr 13 14:17:19 launchpad ollama[1754]: time=2025-04-13T14:17:19.318-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:17:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:19 | 200 |  2.319222132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:19 | 200 |   176.44222ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:20 | 200 |  1.063898495s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:20 | 200 |   97.456771ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:21 | 200 |  177.324342ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |  1.066073961s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |   95.389306ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:22 | 200 |  177.712901ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |  778.405138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |   96.984762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:23 | 200 |  178.013899ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |   984.17276ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |    96.39856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:24 | 200 |  175.885764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |  707.201969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |   97.504063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:25 | 200 |  177.951254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:26 | 200 |  1.126825267s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   98.129709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |  177.913595ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   515.08808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |   96.557575ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:27 | 200 |  138.549689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  254.054972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  177.425297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  138.834882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  161.296773ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:28 | 200 |  137.164197ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  179.224766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  472.491769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  136.947242ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:29 | 200 |  178.195423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |  618.304068ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |   98.411077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:30 | 200 |  177.763165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  455.585747ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  178.780445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  180.004825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  178.484068ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:31 | 200 |  179.638942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |  179.764081ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |  671.106515ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:32 | 200 |   96.062121ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  179.441546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  208.985995ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  178.031172ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:33 | 200 |  179.445471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  698.184353ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  138.240377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:34 | 200 |  178.092645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  294.519435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  179.066968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  177.470116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  262.179455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:35 | 200 |  178.519184ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  178.901163ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  210.928807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  177.787493ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  176.451428ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:36 | 200 |  228.905948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:37 | 200 |  137.155408ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:37 | 200 |  177.565041ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:38 | 200 |  1.417654354s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:17:38 launchpad ollama[1754]: time=2025-04-13T14:17:38.835-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:17:38 launchpad ollama[1754]: time=2025-04-13T14:17:38.977-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.663-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.663-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34649"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.664-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] build info | build=0 commit="unknown" tid="139952822145024" timestamp=1744579059
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139952822145024" timestamp=1744579059 total_threads=16
+Apr 13 14:17:39 launchpad ollama[60222]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34649" tid="139952822145024" timestamp=1744579059
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:17:39 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:17:39 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:17:39 launchpad ollama[1754]: time=2025-04-13T14:17:39.961-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:17:39 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:17:40 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:17:40 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:17:40 launchpad ollama[60222]: INFO [main] model loaded | tid="139952822145024" timestamp=1744579060
+Apr 13 14:17:40 launchpad ollama[1754]: time=2025-04-13T14:17:40.964-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:17:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:17:55 | 200 | 16.667132647s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:18:48 launchpad ollama[1754]: time=2025-04-13T14:18:48.847-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="663.6 MiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9746251776 required="6.2 GiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.554-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43257"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.555-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] build info | build=0 commit="unknown" tid="140056496005120" timestamp=1744579129
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140056496005120" timestamp=1744579129 total_threads=16
+Apr 13 14:18:49 launchpad ollama[64159]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43257" tid="140056496005120" timestamp=1744579129
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:18:49 launchpad ollama[1754]: time=2025-04-13T14:18:49.806-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:18:49 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:18:49 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:18:49 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:18:50 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:18:50 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:18:50 launchpad ollama[64159]: INFO [main] model loaded | tid="140056496005120" timestamp=1744579130
+Apr 13 14:18:50 launchpad ollama[1754]: time=2025-04-13T14:18:50.810-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:18:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:50 | 200 |  2.313253254s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:51 | 200 |  166.536958ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |  1.096044537s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |   167.80689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:52 | 200 |  167.278703ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |  1.097934513s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |   85.764496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:53 | 200 |   171.23403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:54 | 200 |  812.408253ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:54 | 200 |   86.286988ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:55 | 200 |  167.250607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |  1.079654534s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |   86.127655ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:56 | 200 |  167.522467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |  745.915362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |   86.271624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:57 | 200 |  167.038229ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |   880.92768ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |   87.246529ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:58 | 200 |  127.160666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  467.429797ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  127.379213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  166.864857ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |   293.54243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:18:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:18:59 | 200 |  167.623815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.778638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  200.019556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.020925ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:00 | 200 |  168.775122ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |  459.824909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |   86.685794ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:01 | 200 |  168.194174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |  963.723567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |   87.690111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |  166.403028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:02 | 200 |   454.23558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  167.392722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |   168.62208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  178.556716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  167.639075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:03 | 200 |  168.254254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:04 | 200 |  885.088514ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:04 | 200 |    87.53913ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  167.769488ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  207.318696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  172.321189ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:05 | 200 |  168.721826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  692.854417ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  126.812033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |   168.76819ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:06 | 200 |  294.056719ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.403955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.626091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  259.491471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  168.224123ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:07 | 200 |  167.666895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  207.745311ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  166.485726ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  169.049776ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |   226.26969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:08 | 200 |  128.590684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:09 | 200 |  166.536268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:10 | 200 |  1.256961847s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:19:10 launchpad ollama[1754]: time=2025-04-13T14:19:10.439-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:19:10 launchpad ollama[1754]: time=2025-04-13T14:19:10.580-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.248-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.3 GiB" free_swap="68.9 GiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.248-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 43895"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.249-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] build info | build=0 commit="unknown" tid="139804451819520" timestamp=1744579151
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139804451819520" timestamp=1744579151 total_threads=16
+Apr 13 14:19:11 launchpad ollama[64264]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43895" tid="139804451819520" timestamp=1744579151
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:19:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:19:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:19:11 launchpad ollama[1754]: time=2025-04-13T14:19:11.557-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:19:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:19:12 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:19:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:19:12 launchpad ollama[64264]: INFO [main] model loaded | tid="139804451819520" timestamp=1744579152
+Apr 13 14:19:12 launchpad ollama[1754]: time=2025-04-13T14:19:12.561-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:19:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:19:29 | 200 | 19.527197996s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:22:07 launchpad ollama[1754]: time=2025-04-13T14:22:07.293-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="630.1 MiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9711058944 required="6.2 GiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.017-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39385"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.018-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] build info | build=0 commit="unknown" tid="139926906634240" timestamp=1744579328
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139926906634240" timestamp=1744579328 total_threads=16
+Apr 13 14:22:08 launchpad ollama[69406]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39385" tid="139926906634240" timestamp=1744579328
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:22:08 launchpad ollama[1754]: time=2025-04-13T14:22:08.269-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:22:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:22:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:22:08 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:22:08 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:22:09 launchpad ollama[69406]: INFO [main] model loaded | tid="139926906634240" timestamp=1744579329
+Apr 13 14:22:09 launchpad ollama[1754]: time=2025-04-13T14:22:09.273-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:22:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:09 | 200 |  2.353418297s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:09 | 200 |  188.397914ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:10 | 200 |  1.191260053s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:10 | 200 |   69.236879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:11 | 200 |   189.68103ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |  1.273561948s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |   110.35195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:12 | 200 |  189.628454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  774.245212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  108.251793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:13 | 200 |  192.123101ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:14 | 200 |  1.016973079s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:14 | 200 |  109.635079ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:15 | 200 |   190.88805ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:15 | 200 |  701.048332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:16 | 200 |  109.971343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:16 | 200 |  191.383176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |  1.094546905s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |   67.951011ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:17 | 200 |  189.827893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  510.153815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  148.625137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  190.730312ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |  293.901948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:18 | 200 |   193.32415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |   191.02034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  201.141419ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  190.349755ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:19 | 200 |  193.392909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |  460.750557ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |   108.52019ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:20 | 200 |   190.12915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  616.629895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  108.459063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  191.335463ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:21 | 200 |  454.891338ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  190.275966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  190.391963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  179.505602ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  191.355618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:22 | 200 |  189.507433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:23 | 200 |  886.072943ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:23 | 200 |   109.44457ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.639759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  207.415035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.483674ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:24 | 200 |  190.162139ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  733.686797ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  148.325524ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:25 | 200 |  190.494525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  294.843235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  190.511629ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  189.528435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  260.039953ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:26 | 200 |  191.069802ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.340179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  210.637842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.887445ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  190.896501ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:27 | 200 |  227.558483ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:28 | 200 |  150.969037ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:28 | 200 |  191.919331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:29 | 200 |  1.447125893s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.028-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.175-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.860-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.860-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 38375"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:22:30 launchpad ollama[1754]: time=2025-04-13T14:22:30.861-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] build info | build=0 commit="unknown" tid="140663304839168" timestamp=1744579350
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140663304839168" timestamp=1744579350 total_threads=16
+Apr 13 14:22:30 launchpad ollama[69518]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38375" tid="140663304839168" timestamp=1744579350
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:22:30 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:22:30 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:22:30 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: time=2025-04-13T14:22:31.156-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:22:31 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:22:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:22:31 launchpad ollama[69518]: INFO [main] model loaded | tid="140663304839168" timestamp=1744579351
+Apr 13 14:22:32 launchpad ollama[1754]: time=2025-04-13T14:22:32.160-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:22:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:22:42 | 200 | 12.304260015s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:25:45 launchpad ollama[1754]: time=2025-04-13T14:25:45.550-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="627.3 MiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9722855424 required="6.2 GiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.268-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40747"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.269-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.270-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] build info | build=0 commit="unknown" tid="140099103772672" timestamp=1744579546
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140099103772672" timestamp=1744579546 total_threads=16
+Apr 13 14:25:46 launchpad ollama[72393]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40747" tid="140099103772672" timestamp=1744579546
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:25:46 launchpad ollama[1754]: time=2025-04-13T14:25:46.521-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:25:46 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:25:46 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:25:46 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:25:47 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:25:47 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:25:47 launchpad ollama[72393]: INFO [main] model loaded | tid="140099103772672" timestamp=1744579547
+Apr 13 14:25:47 launchpad ollama[1754]: time=2025-04-13T14:25:47.524-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:25:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:47 | 200 |  2.341834992s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:47 | 200 |  190.937335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |  1.230380296s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |    109.7556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:49 | 200 |  189.152872ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:50 | 200 |  1.271804326s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:50 | 200 |  110.115751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  192.545603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  776.426211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:51 | 200 |  108.764611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:52 | 200 |  190.287817ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  1.199677852s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  109.310912ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:53 | 200 |  190.554585ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  704.312548ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  108.305441ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:54 | 200 |  191.776949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |  665.574641ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |   109.39452ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:55 | 200 |  190.428114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  470.393571ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  151.581252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  190.686979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:56 | 200 |  293.769644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  191.687966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  189.609013ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  200.416672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  189.762234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:57 | 200 |  190.790831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |  501.043779ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |   109.75825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:58 | 200 |  191.522784ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  661.077699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  109.085825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:25:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:25:59 | 200 |  191.706205ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  482.784477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  152.737928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  194.511255ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |  180.820352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:00 | 200 |   193.58614ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:01 | 200 |  192.450955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:01 | 200 |  891.624352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  110.938891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  191.540928ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  209.958807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  195.275862ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:02 | 200 |  193.021109ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:03 | 200 |  711.733195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:03 | 200 |  194.004708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  192.805524ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  295.026947ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  191.531509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  191.688582ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:04 | 200 |  261.151826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  192.674942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  192.044451ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  211.177455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  193.054096ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:05 | 200 |  191.187125ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  229.448972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  151.241535ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:06 | 200 |  192.661309ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:08 | 200 |    1.3791092s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:26:08 launchpad ollama[1754]: time=2025-04-13T14:26:08.196-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:26:08 launchpad ollama[1754]: time=2025-04-13T14:26:08.340-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.063-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.063-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 33513"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.064-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] build info | build=0 commit="unknown" tid="140251896848384" timestamp=1744579569
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140251896848384" timestamp=1744579569 total_threads=16
+Apr 13 14:26:09 launchpad ollama[72590]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33513" tid="140251896848384" timestamp=1744579569
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:26:09 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:26:09 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:26:09 launchpad ollama[1754]: time=2025-04-13T14:26:09.365-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:26:09 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:26:10 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:26:10 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:26:10 launchpad ollama[72590]: INFO [main] model loaded | tid="140251896848384" timestamp=1744579570
+Apr 13 14:26:10 launchpad ollama[1754]: time=2025-04-13T14:26:10.369-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:26:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:26:27 | 200 | 19.373967531s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:27:35 launchpad ollama[1754]: time=2025-04-13T14:27:35.435-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="602.6 MiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9682288640 required="6.2 GiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.128-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46831"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.130-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] build info | build=0 commit="unknown" tid="140343862452224" timestamp=1744579656
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140343862452224" timestamp=1744579656 total_threads=16
+Apr 13 14:27:36 launchpad ollama[77172]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46831" tid="140343862452224" timestamp=1744579656
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:27:36 launchpad ollama[1754]: time=2025-04-13T14:27:36.382-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:27:36 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:27:36 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:27:36 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:27:37 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:27:37 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:27:37 launchpad ollama[77172]: INFO [main] model loaded | tid="140343862452224" timestamp=1744579657
+Apr 13 14:27:37 launchpad ollama[1754]: time=2025-04-13T14:27:37.386-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:27:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:37 | 200 |  2.296962868s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:37 | 200 |  166.379237ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:38 | 200 |   1.21585603s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:39 | 200 |   46.271231ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:39 | 200 |  169.328422ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |  1.262484925s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |   87.147973ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:40 | 200 |   168.67223ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |  808.248556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |   86.403174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:41 | 200 |  169.081056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  1.189619467s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |   86.619592ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  169.433753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:43 | 200 |  696.491187ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:44 | 200 |   86.183906ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:44 | 200 |  167.617281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |   855.40085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |   87.150238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |  166.972119ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:45 | 200 |  505.714136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  129.234185ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  129.921396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  293.501567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  128.919895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |   128.06142ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:46 | 200 |  200.198491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  169.974946ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  128.035911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  427.187042ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:47 | 200 |  127.275077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  168.862751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  612.763069ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |   85.513107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:48 | 200 |  167.522892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |   448.75843ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  168.272483ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  169.193847ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:49 | 200 |  180.487149ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |  166.390475ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |    169.2984ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:50 | 200 |  704.896492ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  128.902658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.052734ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  207.566074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.267084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:51 | 200 |  167.128932ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  687.098121ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  126.909354ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:52 | 200 |  172.657005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  292.500597ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  166.074656ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  167.533832ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  257.505546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:53 | 200 |  167.795423ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |   167.70183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  207.628208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  169.192335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |   169.29525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:54 | 200 |  226.537473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:55 | 200 |   127.81057ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:55 | 200 |  166.385999ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:27:56 | 200 |  1.192449867s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:27:56 launchpad ollama[1754]: time=2025-04-13T14:27:56.579-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:27:56 launchpad ollama[1754]: time=2025-04-13T14:27:56.720-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.408-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.408-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.409-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37371"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.410-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] build info | build=0 commit="unknown" tid="140009756614656" timestamp=1744579677
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140009756614656" timestamp=1744579677 total_threads=16
+Apr 13 14:27:57 launchpad ollama[77315]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37371" tid="140009756614656" timestamp=1744579677
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:27:57 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:27:57 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:27:57 launchpad ollama[1754]: time=2025-04-13T14:27:57.704-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:27:57 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:27:58 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:27:58 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:27:58 launchpad ollama[77315]: INFO [main] model loaded | tid="140009756614656" timestamp=1744579678
+Apr 13 14:27:58 launchpad ollama[1754]: time=2025-04-13T14:27:58.708-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:28:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:28:16 | 200 | 19.436705671s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:30:17 launchpad ollama[1754]: time=2025-04-13T14:30:17.991-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="616.6 MiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.691-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9715843072 required="6.2 GiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.691-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.692-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39737"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.693-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] build info | build=0 commit="unknown" tid="140471656603648" timestamp=1744579818
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140471656603648" timestamp=1744579818 total_threads=16
+Apr 13 14:30:18 launchpad ollama[82117]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39737" tid="140471656603648" timestamp=1744579818
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:30:18 launchpad ollama[1754]: time=2025-04-13T14:30:18.945-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:30:18 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:30:18 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:30:18 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:30:19 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:30:19 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:30:19 launchpad ollama[82117]: INFO [main] model loaded | tid="140471656603648" timestamp=1744579819
+Apr 13 14:30:19 launchpad ollama[1754]: time=2025-04-13T14:30:19.948-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:20 | 200 |  2.302116458s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:20 | 200 |  173.378433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |  1.192322595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |   45.186387ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:21 | 200 |  168.571201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:22 | 200 |  1.232066125s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:23 | 200 |   85.839851ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:23 | 200 |   166.24997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   768.46796ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   87.257239ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:24 | 200 |   167.29942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |  1.189202392s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |   87.234786ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:25 | 200 |   166.58056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |  738.995533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |   86.575609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:26 | 200 |  168.204617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |  822.510179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |   86.251429ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:27 | 200 |  166.460825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  506.095655ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  126.363774ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  167.313687ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:28 | 200 |  290.438735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  167.591176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  167.484531ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  201.958578ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  165.800071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:29 | 200 |  170.149609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |  458.605678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |   86.575035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:30 | 200 |  168.349579ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  613.360404ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |   87.786444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  167.704186ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:31 | 200 |  450.809127ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  166.860723ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  168.386827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |   177.15343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  170.958186ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:32 | 200 |  172.114221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |  712.021454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |   86.313181ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:33 | 200 |   167.87855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  208.042758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  168.749456ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:34 | 200 |  168.356369ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  704.063972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  127.699159ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  168.709584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |   294.68536ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:35 | 200 |  167.959617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  167.509728ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  263.176951ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  168.508643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  168.436998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:36 | 200 |  210.926308ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  167.421001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  167.531312ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  230.263169ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:37 | 200 |  127.996815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:38 | 200 |   167.05686ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:30:39 | 200 |  1.466588283s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:30:39 launchpad ollama[1754]: time=2025-04-13T14:30:39.618-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:30:39 launchpad ollama[1754]: time=2025-04-13T14:30:39.762-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.461-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.2 GiB" free_swap="68.9 GiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.461-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 39957"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.462-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.463-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] build info | build=0 commit="unknown" tid="139794672869376" timestamp=1744579840
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139794672869376" timestamp=1744579840 total_threads=16
+Apr 13 14:30:40 launchpad ollama[82246]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39957" tid="139794672869376" timestamp=1744579840
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:30:40 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:30:40 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:30:40 launchpad ollama[1754]: time=2025-04-13T14:30:40.759-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:30:40 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:30:41 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:30:41 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:30:41 launchpad ollama[82246]: INFO [main] model loaded | tid="139794672869376" timestamp=1744579841
+Apr 13 14:30:41 launchpad ollama[1754]: time=2025-04-13T14:30:41.762-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:31:15 | 200 | 36.216183489s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:34:40 launchpad ollama[1754]: time=2025-04-13T14:34:40.036-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:34:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:34:43 | 200 |  3.813950747s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.103-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="584.3 MiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9667280896 required="6.2 GiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.803-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44545"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.804-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:34:53 launchpad ollama[1754]: time=2025-04-13T14:34:53.805-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] build info | build=0 commit="unknown" tid="139931029549056" timestamp=1744580093
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139931029549056" timestamp=1744580093 total_threads=16
+Apr 13 14:34:53 launchpad ollama[93210]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44545" tid="139931029549056" timestamp=1744580093
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:34:53 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:34:54 launchpad ollama[1754]: time=2025-04-13T14:34:54.056-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:34:54 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:34:54 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:34:54 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:34:54 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:34:54 launchpad ollama[93210]: INFO [main] model loaded | tid="139931029549056" timestamp=1744580094
+Apr 13 14:34:55 launchpad ollama[1754]: time=2025-04-13T14:34:55.060-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:34:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:34:58 | 200 |  5.802050132s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:34:58 launchpad ollama[1754]: time=2025-04-13T14:34:58.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.028-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.3 GiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.764-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.765-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 38355"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:34:59 launchpad ollama[1754]: time=2025-04-13T14:34:59.766-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] build info | build=0 commit="unknown" tid="139818907459584" timestamp=1744580099
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139818907459584" timestamp=1744580099 total_threads=16
+Apr 13 14:34:59 launchpad ollama[93250]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38355" tid="139818907459584" timestamp=1744580099
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:34:59 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:34:59 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:34:59 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: time=2025-04-13T14:35:00.066-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:00 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:35:00 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:35:00 launchpad ollama[93250]: INFO [main] model loaded | tid="139818907459584" timestamp=1744580100
+Apr 13 14:35:01 launchpad ollama[1754]: time=2025-04-13T14:35:01.069-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:35:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:10 | 200 | 11.258725546s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:35:10 launchpad ollama[1754]: time=2025-04-13T14:35:10.314-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="608.3 MiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.026-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9667280896 required="6.2 GiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.027-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.027-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43095"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.028-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] build info | build=0 commit="unknown" tid="140445655883776" timestamp=1744580111
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140445655883776" timestamp=1744580111 total_threads=16
+Apr 13 14:35:11 launchpad ollama[95674]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43095" tid="140445655883776" timestamp=1744580111
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:35:11 launchpad ollama[1754]: time=2025-04-13T14:35:11.279-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:35:11 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:35:11 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:35:11 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:12 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:35:12 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:35:12 launchpad ollama[95674]: INFO [main] model loaded | tid="140445655883776" timestamp=1744580112
+Apr 13 14:35:12 launchpad ollama[1754]: time=2025-04-13T14:35:12.282-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:35:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:12 | 200 |  2.314496336s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:12 | 200 |  178.116447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:13 | 200 |    1.1959879s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:13 | 200 |   96.382327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:14 | 200 |  180.869841ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |   1.23697766s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |   96.504888ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:15 | 200 |  177.413049ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |  776.157858ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |   97.160568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:16 | 200 |  178.191536ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:17 | 200 |  1.144426595s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |   96.176572ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |  178.716816ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:18 | 200 |  742.293388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:19 | 200 |   98.626052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:19 | 200 |  180.902001ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  962.743394ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |   56.872956ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  177.506766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:20 | 200 |  467.377203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  139.222363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  179.569167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  293.263195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  180.419679ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:21 | 200 |  178.266896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  201.409315ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  180.054981ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  179.706601ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:22 | 200 |  459.576033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |   98.006545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |  178.718313ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:23 | 200 |  615.969164ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |   97.232233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |  179.183213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |   492.92924ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:24 | 200 |  136.994971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  177.437023ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  179.372754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  178.921879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:25 | 200 |  176.965531ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |   666.45808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |   96.754528ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  179.713556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  207.324831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:26 | 200 |  177.892347ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:27 | 200 |  178.809212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:27 | 200 |   692.82106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  178.176721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  179.007007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  294.518091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  178.252741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:28 | 200 |  179.286781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  260.886672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  178.373155ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |   178.31738ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  208.019923ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:29 | 200 |  179.314934ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  177.517472ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  227.922241ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |   137.51217ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:30 | 200 |  176.991731ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |  1.458218137s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |   56.194729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:32 | 200 |  138.109264ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:33 | 200 |  1.312072758s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:35:33 launchpad ollama[1754]: time=2025-04-13T14:35:33.848-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:35:33 launchpad ollama[1754]: time=2025-04-13T14:35:33.990-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.665-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.666-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 42123"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.667-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] build info | build=0 commit="unknown" tid="140547730632704" timestamp=1744580134
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140547730632704" timestamp=1744580134 total_threads=16
+Apr 13 14:35:34 launchpad ollama[95820]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42123" tid="140547730632704" timestamp=1744580134
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:35:34 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:35:34 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:35:34 launchpad ollama[1754]: time=2025-04-13T14:35:34.975-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:35:34 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:35:35 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:35:35 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:35:35 launchpad ollama[95820]: INFO [main] model loaded | tid="140547730632704" timestamp=1744580135
+Apr 13 14:35:35 launchpad ollama[1754]: time=2025-04-13T14:35:35.978-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 14:35:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:35:47 | 200 |   13.4673587s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:37:21 launchpad ollama[1754]: time=2025-04-13T14:37:21.968-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="544.4 MiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9640083456 required="6.2 GiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.669-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46639"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.670-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] build info | build=0 commit="unknown" tid="140693713473536" timestamp=1744580242
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140693713473536" timestamp=1744580242 total_threads=16
+Apr 13 14:37:22 launchpad ollama[98703]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46639" tid="140693713473536" timestamp=1744580242
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:37:22 launchpad ollama[1754]: time=2025-04-13T14:37:22.922-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:37:22 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:37:22 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:37:22 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:37:23 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:37:23 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:37:23 launchpad ollama[98703]: INFO [main] model loaded | tid="140693713473536" timestamp=1744580243
+Apr 13 14:37:23 launchpad ollama[1754]: time=2025-04-13T14:37:23.925-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:37:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:24 | 200 |  2.315089317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:24 | 200 |   166.59957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |  980.672758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |   85.987893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:25 | 200 |  166.924182ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  975.715946ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  166.199779ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:26 | 200 |  168.077027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |  771.804659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |   85.363072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:27 | 200 |  168.148106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:28 | 200 |  1.073632492s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:29 | 200 |   86.120328ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:29 | 200 |  165.991056ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |  738.237636ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |   86.471263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:30 | 200 |  168.129891ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  1.029468497s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  126.173962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:31 | 200 |  166.358133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  467.155855ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  127.054223ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  167.369857ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |   292.33407ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:32 | 200 |  166.069377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  165.470243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |   200.73332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  166.734724ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:33 | 200 |  167.481937ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |  499.446187ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |   127.40549ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:34 | 200 |  166.312732ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |  998.216136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |    84.12942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:35 | 200 |  165.787686ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  452.683624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  166.119401ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  167.372974ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  176.392607ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:36 | 200 |  167.712265ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |  166.020926ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |  881.070099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:37 | 200 |   84.837646ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  168.355874ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |   207.20885ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  170.129702ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:38 | 200 |  167.576262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |  730.203904ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |     126.269ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:39 | 200 |  167.988504ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  294.312082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  166.463824ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |   166.23161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  259.133759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:40 | 200 |  167.989997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |    166.0267ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  207.488415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  166.309144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |  166.291741ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:41 | 200 |   229.73985ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:42 | 200 |   125.80907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:42 | 200 |  170.979991ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:37:43 | 200 |  1.271821762s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:37:43 launchpad ollama[1754]: time=2025-04-13T14:37:43.613-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:37:43 launchpad ollama[1754]: time=2025-04-13T14:37:43.753-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.470-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.9 GiB" free_swap="68.9 GiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.471-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 37701"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.472-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.473-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] build info | build=0 commit="unknown" tid="139752232804352" timestamp=1744580264
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139752232804352" timestamp=1744580264 total_threads=16
+Apr 13 14:37:44 launchpad ollama[98819]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37701" tid="139752232804352" timestamp=1744580264
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:37:44 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:37:44 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:37:44 launchpad ollama[1754]: time=2025-04-13T14:37:44.786-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:37:44 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:37:45 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:37:45 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:37:45 launchpad ollama[98819]: INFO [main] model loaded | tid="139752232804352" timestamp=1744580265
+Apr 13 14:37:45 launchpad ollama[1754]: time=2025-04-13T14:37:45.790-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.32 seconds"
+Apr 13 14:38:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:38:01 | 200 |  18.27445792s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.128-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="534.2 MiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.830-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9610526720 required="6.2 GiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.831-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.831-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41577"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:39:07 launchpad ollama[1754]: time=2025-04-13T14:39:07.832-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] build info | build=0 commit="unknown" tid="140652930228224" timestamp=1744580347
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140652930228224" timestamp=1744580347 total_threads=16
+Apr 13 14:39:07 launchpad ollama[103173]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41577" tid="140652930228224" timestamp=1744580347
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:39:07 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:39:08 launchpad ollama[1754]: time=2025-04-13T14:39:08.083-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:39:08 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:39:08 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:39:08 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:39:08 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:39:08 launchpad ollama[103173]: INFO [main] model loaded | tid="140652930228224" timestamp=1744580348
+Apr 13 14:39:09 launchpad ollama[1754]: time=2025-04-13T14:39:09.087-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:39:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:09 | 200 |  2.329335999s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:09 | 200 |  169.926374ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  1.132767102s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  130.829894ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:10 | 200 |  172.632389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |  1.130264781s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |   90.205289ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:12 | 200 |  172.187878ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |  805.233297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |   88.955386ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:13 | 200 |  171.941174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |  973.317608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |   90.134204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:14 | 200 |  173.875104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |  707.457795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |   89.656754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:15 | 200 |  173.209523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  929.248156ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  131.001807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:16 | 200 |  172.770244ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  464.639723ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  130.222601ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:17 | 200 |  171.325807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  291.187917ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  170.831525ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  172.566722ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  201.040661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  171.786778ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:18 | 200 |  169.581804ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |  497.590619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |   90.045003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:19 | 200 |  171.106939ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |  651.564659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |   90.527095ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:20 | 200 |  170.741918ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  493.252642ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  129.900256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  171.470108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  177.588201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  170.188466ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:21 | 200 |  172.863902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |  664.346179ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |   90.436919ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:22 | 200 |  171.593795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |   206.75895ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |  171.253077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:23 | 200 |  172.073231ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  687.208201ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |   129.81413ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  171.588471ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  294.572335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:24 | 200 |  172.780845ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  171.341557ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  258.210917ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  170.748898ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |   172.21863ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:25 | 200 |  208.696254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  170.215091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  171.483074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  226.776643ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:26 | 200 |  131.068224ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:27 | 200 |  171.629924ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:27 | 200 |   755.27518ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:39:27 launchpad ollama[1754]: time=2025-04-13T14:39:27.861-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.002-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.700-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="56.0 GiB" free_swap="68.9 GiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.701-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.701-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 43299"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:39:28 launchpad ollama[1754]: time=2025-04-13T14:39:28.702-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] build info | build=0 commit="unknown" tid="140375132950528" timestamp=1744580368
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140375132950528" timestamp=1744580368 total_threads=16
+Apr 13 14:39:28 launchpad ollama[103282]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43299" tid="140375132950528" timestamp=1744580368
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:39:28 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:39:28 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:39:28 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: time=2025-04-13T14:39:29.000-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:39:29 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:39:29 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:39:29 launchpad ollama[103282]: INFO [main] model loaded | tid="140375132950528" timestamp=1744580369
+Apr 13 14:39:30 launchpad ollama[1754]: time=2025-04-13T14:39:30.003-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:39:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:39:50 | 200 | 22.534664776s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:42:51 launchpad ollama[1754]: time=2025-04-13T14:42:51.522-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="497.8 MiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9593356288 required="6.2 GiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.227-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36823"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.228-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] build info | build=0 commit="unknown" tid="139823987769344" timestamp=1744580572
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139823987769344" timestamp=1744580572 total_threads=16
+Apr 13 14:42:52 launchpad ollama[109510]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36823" tid="139823987769344" timestamp=1744580572
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:42:52 launchpad ollama[1754]: time=2025-04-13T14:42:52.480-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:42:52 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:42:52 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:42:52 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:42:53 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:42:53 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:42:53 launchpad ollama[109510]: INFO [main] model loaded | tid="139823987769344" timestamp=1744580573
+Apr 13 14:42:53 launchpad ollama[1754]: time=2025-04-13T14:42:53.483-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:42:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:53 | 200 |  2.306454143s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:53 | 200 |  165.621165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  1.227310829s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  125.410836ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:55 | 200 |  165.467317ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |  1.224884307s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |    84.37337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:56 | 200 |  169.247118ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |  815.151353ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |   86.078562ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:57 | 200 |  166.381371ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:58 | 200 |   979.58049ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |   84.717161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |  168.557701ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:42:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:42:59 | 200 |  700.521491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:00 | 200 |   87.101051ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:00 | 200 |  167.423538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  859.046988ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |   84.606035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  167.785209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |  508.101419ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:01 | 200 |    86.04045ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  126.325971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  253.398113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  129.163267ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  167.329112ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:02 | 200 |  201.597252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  170.190455ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  126.588016ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  470.740276ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |  125.755111ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:03 | 200 |   166.34965ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |  614.579537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |   84.393243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:04 | 200 |  167.553907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  451.345767ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  168.560357ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  166.757286ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |  176.534285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:05 | 200 |   165.06565ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |  167.382866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |  706.852649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:06 | 200 |   84.817241ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  168.089006ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  208.238021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |   166.77963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:07 | 200 |  167.257977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |    690.1052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  125.983995ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  166.884318ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:08 | 200 |  292.994953ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  167.083415ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  168.358887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  262.225932ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  166.610526ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:09 | 200 |  166.903469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  208.484896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  167.800929ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  168.639568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:10 | 200 |  228.734388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:11 | 200 |  127.559827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:11 | 200 |  164.091957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:12 | 200 |  1.103207366s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:43:12 launchpad ollama[1754]: time=2025-04-13T14:43:12.364-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:43:12 launchpad ollama[1754]: time=2025-04-13T14:43:12.511-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.2 GiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.184-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.6 GiB" free_swap="68.9 GiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.184-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=39 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.9 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.9 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.185-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 39 --parallel 1 --port 34477"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.186-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] build info | build=0 commit="unknown" tid="139790480424960" timestamp=1744580593
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139790480424960" timestamp=1744580593 total_threads=16
+Apr 13 14:43:13 launchpad ollama[109650]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34477" tid="139790480424960" timestamp=1744580593
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:43:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:43:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:43:13 launchpad ollama[1754]: time=2025-04-13T14:43:13.482-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: offloading 39 repeating layers to GPU
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors: offloaded 39/41 layers to GPU
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:43:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6637.62 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:43:14 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    40.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1560.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:43:14 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 15
+Apr 13 14:43:14 launchpad ollama[109650]: INFO [main] model loaded | tid="139790480424960" timestamp=1744580594
+Apr 13 14:43:14 launchpad ollama[1754]: time=2025-04-13T14:43:14.486-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:43:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:43:44 | 200 | 32.071065531s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.252-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="455.6 MiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.987-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9540665344 required="6.2 GiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.987-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.988-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45257"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:47:53 launchpad ollama[1754]: time=2025-04-13T14:47:53.989-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] build info | build=0 commit="unknown" tid="139852766101504" timestamp=1744580874
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139852766101504" timestamp=1744580874 total_threads=16
+Apr 13 14:47:54 launchpad ollama[119217]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45257" tid="139852766101504" timestamp=1744580874
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:47:54 launchpad ollama[1754]: time=2025-04-13T14:47:54.240-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:47:54 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:47:54 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:47:54 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:47:54 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:47:55 launchpad ollama[119217]: INFO [main] model loaded | tid="139852766101504" timestamp=1744580875
+Apr 13 14:47:55 launchpad ollama[1754]: time=2025-04-13T14:47:55.245-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 14:47:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:55 | 200 |  2.336176936s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:55 | 200 |  167.620546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:56 | 200 |  1.206048293s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:56 | 200 |    87.37967ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:57 | 200 |  165.898174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |  1.247195317s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |    86.67248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:58 | 200 |  165.470134ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |  817.522098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |    83.67877ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:47:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:47:59 | 200 |   170.88948ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:00 | 200 |  1.202322985s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:00 | 200 |   85.512998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:01 | 200 |  168.123297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:01 | 200 |  743.007025ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:02 | 200 |   84.748842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:02 | 200 |  165.219503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  833.215833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |   86.181899ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  168.187141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |  510.773263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:03 | 200 |   86.410317ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  165.704753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  292.947285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  167.087546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  167.075167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:04 | 200 |  205.593958ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  126.821211ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  167.391458ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  469.677802ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:05 | 200 |  127.148157ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  168.197813ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  613.204189ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |   87.346538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:06 | 200 |  167.566321ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  455.200156ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |   168.12777ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  166.665977ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:07 | 200 |  178.424922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  169.158281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  165.960777ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:08 | 200 |  708.932235ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  125.851304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.364736ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  209.080447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.822624ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:09 | 200 |  166.438521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |  734.511083ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |   44.988457ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:10 | 200 |  126.362646ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  253.265742ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  124.044282ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  125.747611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  220.209076ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  167.286234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:11 | 200 |  167.117513ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  168.128894ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  126.392959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  167.375091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  187.859807ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |   85.059234ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:12 | 200 |  165.927161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:14 | 200 |  1.229156956s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:48:14 launchpad ollama[1754]: time=2025-04-13T14:48:14.226-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:48:14 launchpad ollama[1754]: time=2025-04-13T14:48:14.392-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.061-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.062-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 35195"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] build info | build=0 commit="unknown" tid="140598928834560" timestamp=1744580895
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140598928834560" timestamp=1744580895 total_threads=16
+Apr 13 14:48:15 launchpad ollama[119359]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35195" tid="140598928834560" timestamp=1744580895
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:48:15 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:48:15 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 14:48:15 launchpad ollama[1754]: time=2025-04-13T14:48:15.358-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:48:16 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:48:16 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 14:48:16 launchpad ollama[119359]: INFO [main] model loaded | tid="140598928834560" timestamp=1744580896
+Apr 13 14:48:16 launchpad ollama[1754]: time=2025-04-13T14:48:16.362-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:48:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:48:33 | 200 | 19.272204176s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:51:42 launchpad ollama[1754]: time=2025-04-13T14:51:42.475-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="657.8 MiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9525985280 required="6.2 GiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.181-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46131"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.182-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.183-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] build info | build=0 commit="unknown" tid="139700466872320" timestamp=1744581103
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139700466872320" timestamp=1744581103 total_threads=16
+Apr 13 14:51:43 launchpad ollama[125244]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46131" tid="139700466872320" timestamp=1744581103
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:51:43 launchpad ollama[1754]: time=2025-04-13T14:51:43.434-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:51:43 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:51:43 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:51:43 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:51:44 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:51:44 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:51:44 launchpad ollama[125244]: INFO [main] model loaded | tid="139700466872320" timestamp=1744581104
+Apr 13 14:51:44 launchpad ollama[1754]: time=2025-04-13T14:51:44.437-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:44 | 200 |  2.327866018s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:44 | 200 |  155.836254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:45 | 200 |  1.110089181s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:45 | 200 |   75.123658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:46 | 200 |  156.597133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |  1.106637975s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |   73.345248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:47 | 200 |  155.373546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |  774.105638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |   74.939363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:48 | 200 |  155.301304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |   709.21574ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |   75.734883ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:49 | 200 |  155.098412ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |  704.067644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |   73.998753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:50 | 200 |  154.389104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |    1.0316162s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |    73.81543ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:51 | 200 |  157.721209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |   508.86493ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  114.279137ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  156.335117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  295.390059ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:52 | 200 |  155.778386ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.608959ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  200.711905ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.383567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:53 | 200 |  155.718212ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  418.141277ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |   73.875119ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  155.409292ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:54 | 200 |  616.557785ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |    74.17406ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  157.039781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  453.787568ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  156.315774ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:55 | 200 |  157.406733ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  177.439879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  158.188064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:56 | 200 |  156.004753ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  709.210402ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  116.001729ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  156.643015ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  207.541271ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:57 | 200 |  157.971427ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |   157.60671ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |  736.071452ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |    33.53547ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:58 | 200 |  158.695871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  254.035204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  115.467262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |   157.13492ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  224.019162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:51:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:51:59 | 200 |  115.471842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  156.829552ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  170.067603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  115.655003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  158.236461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:00 | 200 |  229.998766ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:01 | 200 |  115.547989ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:01 | 200 |  155.503699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:02 | 200 |  1.346159523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 14:52:02 launchpad ollama[1754]: time=2025-04-13T14:52:02.734-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 14:52:02 launchpad ollama[1754]: time=2025-04-13T14:52:02.879-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.548-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.5 GiB" free_swap="68.9 GiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.549-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 40581"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.550-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] build info | build=0 commit="unknown" tid="140122625363968" timestamp=1744581123
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140122625363968" timestamp=1744581123 total_threads=16
+Apr 13 14:52:03 launchpad ollama[125352]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40581" tid="140122625363968" timestamp=1744581123
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:52:03 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:52:03 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 14:52:03 launchpad ollama[1754]: time=2025-04-13T14:52:03.848-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 14:52:03 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:52:04 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 14:52:04 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 14:52:04 launchpad ollama[125352]: INFO [main] model loaded | tid="140122625363968" timestamp=1744581124
+Apr 13 14:52:04 launchpad ollama[1754]: time=2025-04-13T14:52:04.852-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 14:52:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:11 | 200 |  8.940236674s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.106-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="671.6 MiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9534439424 required="6.2 GiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.4 GiB" free_swap="68.9 GiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.797-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33843"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 14:52:12 launchpad ollama[1754]: time=2025-04-13T14:52:12.798-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] build info | build=0 commit="unknown" tid="139974700773376" timestamp=1744581132
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139974700773376" timestamp=1744581132 total_threads=16
+Apr 13 14:52:12 launchpad ollama[126368]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33843" tid="139974700773376" timestamp=1744581132
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 14:52:12 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 14:52:13 launchpad ollama[1754]: time=2025-04-13T14:52:13.049-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 14:52:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 14:52:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 14:52:13 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 14:52:13 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 14:52:13 launchpad ollama[126368]: INFO [main] model loaded | tid="139974700773376" timestamp=1744581133
+Apr 13 14:52:14 launchpad ollama[1754]: time=2025-04-13T14:52:14.053-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 14:52:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 14:52:16 | 200 |  4.487181651s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.371-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.526-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.526-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39539"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.527-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] build info | build=0 commit="unknown" tid="140461685592064" timestamp=1744582825
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140461685592064" timestamp=1744582825 total_threads=16
+Apr 13 15:20:25 launchpad ollama[139322]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39539" tid="140461685592064" timestamp=1744582825
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:20:25 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:20:25 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:20:25 launchpad ollama[1754]: time=2025-04-13T15:20:25.831-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:20:25 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:20:26 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:20:26 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:20:26 launchpad ollama[139322]: INFO [main] model loaded | tid="140461685592064" timestamp=1744582826
+Apr 13 15:20:26 launchpad ollama[1754]: time=2025-04-13T15:20:26.835-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 15:20:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:36 | 200 | 11.167640744s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:20:36 launchpad ollama[1754]: time=2025-04-13T15:20:36.703-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="695.0 MiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9517006848 required="6.2 GiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.402-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38901"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.403-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.404-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] build info | build=0 commit="unknown" tid="139744251215872" timestamp=1744582837
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139744251215872" timestamp=1744582837 total_threads=16
+Apr 13 15:20:37 launchpad ollama[141884]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38901" tid="139744251215872" timestamp=1744582837
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:20:37 launchpad ollama[1754]: time=2025-04-13T15:20:37.655-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:20:37 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:20:37 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:20:37 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:20:38 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:20:38 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:20:38 launchpad ollama[141884]: INFO [main] model loaded | tid="139744251215872" timestamp=1744582838
+Apr 13 15:20:38 launchpad ollama[1754]: time=2025-04-13T15:20:38.658-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 15:20:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:38 | 200 |  2.335533744s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:39 | 200 |  219.083659ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  1.223402429s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  136.235196ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:40 | 200 |  218.879614ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:41 | 200 |  1.224914767s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:42 | 200 |  138.740362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:42 | 200 |  220.989007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |  855.910944ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |   138.90098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:43 | 200 |  220.597509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:44 | 200 |  1.151169528s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:44 | 200 |  136.198097ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |  220.836503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |   750.97529ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:45 | 200 |  135.031813ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:46 | 200 |  218.368521ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:46 | 200 |  777.931599ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |   97.067331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |  219.454273ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:47 | 200 |    470.3714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  179.296821ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  218.896165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  297.237775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  217.391893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:48 | 200 |  215.870248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |  202.543856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |  218.871805ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:49 | 200 |   224.20177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |  460.759715ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |  137.360543ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:50 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:50 | 200 |   224.83623ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  975.741558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  134.810334ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:51 | 200 |  218.520099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  461.369368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  177.507663ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  218.762699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:52 | 200 |  179.885323ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:53 | 200 |  216.628303ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:53 | 200 |  223.107619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  677.738395ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  137.562116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  219.466324ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  211.088517ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:54 | 200 |  217.568166ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |  218.191054ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |  694.017096ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:55 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:55 | 200 |     178.036ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  221.769571ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  298.858248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  221.898985ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:56 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:56 | 200 |  223.632845ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  263.110915ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |   217.82677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  217.428541ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:57 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:57 | 200 |  211.145909ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  218.083669ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  217.529936ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  231.392322ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:58 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:58 | 200 |  175.889072ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:20:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:20:59 | 200 |  217.337147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  1.162428015s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  134.319078ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:00 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:00 | 200 |  217.600848ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:01 | 200 |  1.037685909s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:01 | 200 |  135.464764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  218.163688ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  477.654323ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  139.213181ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:02 | 200 |  219.294195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:04 | 200 |  1.233782221s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:21:04 launchpad ollama[1754]: time=2025-04-13T15:21:04.252-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:21:04 launchpad ollama[1754]: time=2025-04-13T15:21:04.402-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.088-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.089-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 39235"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.090-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] build info | build=0 commit="unknown" tid="139817837953024" timestamp=1744582865
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139817837953024" timestamp=1744582865 total_threads=16
+Apr 13 15:21:05 launchpad ollama[142039]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39235" tid="139817837953024" timestamp=1744582865
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:21:05 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:21:05 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:21:05 launchpad ollama[1754]: time=2025-04-13T15:21:05.388-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:21:05 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:21:06 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:21:06 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:21:06 launchpad ollama[142039]: INFO [main] model loaded | tid="139817837953024" timestamp=1744582866
+Apr 13 15:21:06 launchpad ollama[1754]: time=2025-04-13T15:21:06.391-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:21:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:21:18 | 200 | 13.944236364s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:24:05 launchpad ollama[1754]: time=2025-04-13T15:24:05.075-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:24:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:12 | 200 |  7.599873801s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:24:12 launchpad ollama[1754]: time=2025-04-13T15:24:12.842-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="628.9 MiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9514844160 required="6.2 GiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.540-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36671"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.542-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] build info | build=0 commit="unknown" tid="140111561814016" timestamp=1744583053
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140111561814016" timestamp=1744583053 total_threads=16
+Apr 13 15:24:13 launchpad ollama[146893]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36671" tid="140111561814016" timestamp=1744583053
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:24:13 launchpad ollama[1754]: time=2025-04-13T15:24:13.794-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:24:13 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:24:13 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:24:13 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:24:14 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:24:14 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:24:14 launchpad ollama[146893]: INFO [main] model loaded | tid="140111561814016" timestamp=1744583054
+Apr 13 15:24:14 launchpad ollama[1754]: time=2025-04-13T15:24:14.797-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:24:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:14 | 200 |   2.30259646s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:15 | 200 |  177.778542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |  1.304098939s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |   57.662815ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:16 | 200 |  182.232523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |  1.355317239s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |   96.168717ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:18 | 200 |  179.389372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |  826.725606ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |   99.253145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:19 | 200 |  179.874021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |  995.533781ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |   99.210141ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:20 | 200 |  180.905064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |  721.745916ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |   99.566296ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:21 | 200 |  184.972369ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:22 | 200 |     907.063ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:22 | 200 |    97.61916ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  179.471147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  523.254887ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |   57.297338ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:23 | 200 |  139.995299ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  255.328335ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  176.788567ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  175.849215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  198.014911ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  178.582029ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:24 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:24 | 200 |  137.825148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  465.989827ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  137.951638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:25 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:25 | 200 |  174.259795ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |  615.923479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |   98.989301ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:26 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:26 | 200 |  180.735114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  448.715728ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  176.590489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  176.606538ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  175.912116ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:27 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:27 | 200 |  174.230091ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:28 | 200 |  176.343368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:28 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:28 | 200 |  852.401871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |   92.890689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  175.123809ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  204.803523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  176.510692ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:29 | 200 |  178.906033ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  726.008503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  135.877756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:30 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:30 | 200 |  133.923684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  282.130074ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  179.448196ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  180.601225ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |  252.863629ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:31 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:31 | 200 |   173.38114ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  173.141291ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |   202.71872ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  178.541055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  176.393227ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:32 | 200 |  223.083898ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:33 | 200 |  133.447604ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:33 | 200 |  175.584425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:34 | 200 |  1.485007028s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:34 | 200 |   92.675692ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:35 | 200 |  175.979136ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  929.812505ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |   93.013605ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  174.103003ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |  452.557148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:36 | 200 |   97.380864ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:37 | 200 |  177.753677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:38 | 200 |  1.250041522s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:24:38 launchpad ollama[1754]: time=2025-04-13T15:24:38.398-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:24:38 launchpad ollama[1754]: time=2025-04-13T15:24:38.537-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.201-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.201-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 36773"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.202-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] build info | build=0 commit="unknown" tid="140169570066432" timestamp=1744583079
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140169570066432" timestamp=1744583079 total_threads=16
+Apr 13 15:24:39 launchpad ollama[147052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36773" tid="140169570066432" timestamp=1744583079
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:24:39 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:24:39 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:24:39 launchpad ollama[1754]: time=2025-04-13T15:24:39.504-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:24:39 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:24:40 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:24:40 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:24:40 launchpad ollama[147052]: INFO [main] model loaded | tid="140169570066432" timestamp=1744583080
+Apr 13 15:24:40 launchpad ollama[1754]: time=2025-04-13T15:24:40.507-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 13 15:24:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:24:52 | 200 | 14.585918113s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:28:16 launchpad ollama[1754]: time=2025-04-13T15:28:16.246-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:28:29 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:29 | 200 | 13.646273291s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.058-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="632.9 MiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.771-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9493807104 required="6.2 GiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.771-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.772-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.772-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46661"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:28:30 launchpad ollama[1754]: time=2025-04-13T15:28:30.773-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] build info | build=0 commit="unknown" tid="139671107436544" timestamp=1744583310
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139671107436544" timestamp=1744583310 total_threads=16
+Apr 13 15:28:30 launchpad ollama[155592]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46661" tid="139671107436544" timestamp=1744583310
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:28:30 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:28:31 launchpad ollama[1754]: time=2025-04-13T15:28:31.024-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:28:31 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:28:31 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:28:31 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:28:31 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:28:31 launchpad ollama[155592]: INFO [main] model loaded | tid="139671107436544" timestamp=1744583311
+Apr 13 15:28:32 launchpad ollama[1754]: time=2025-04-13T15:28:32.028-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 13 15:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:32 | 200 |  2.309975384s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:32 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:32 | 200 |  175.763556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  1.046365842s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  100.228233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:33 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:33 | 200 |  180.818243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:34 | 200 |  1.048444731s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:34 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:34 | 200 |   93.478486ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |  175.213153ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |  781.520866ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:35 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:35 | 200 |   92.795907ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:36 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:36 | 200 |  174.127401ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |  925.527678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |   92.992648ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:37 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:37 | 200 |  176.601691ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |  717.241382ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |    94.28505ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:38 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:38 | 200 |  175.718702ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |  894.676217ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |   93.103377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:39 | 200 |  173.829834ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  512.827078ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  137.538919ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  176.199666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |   295.36699ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:40 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:40 | 200 |  176.711206ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  175.252962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  200.886685ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  174.826479ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:41 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:41 | 200 |  174.554458ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |   427.65367ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |   93.555176ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |  174.998882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:42 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:42 | 200 |  623.251533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |   94.898177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  174.574368ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  454.208739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:43 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:43 | 200 |  175.775594ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  177.891789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  178.793725ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  174.754393ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:44 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:44 | 200 |  173.874997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |  717.899963ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |   93.361893ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |   175.09635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:45 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:45 | 200 |  208.630502ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  174.736263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  175.305363ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:46 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:46 | 200 |  704.026421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  134.881955ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |   175.89158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |   296.27863ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  176.761058ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:47 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:47 | 200 |  175.372341ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  262.516666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  175.024969ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  174.266167ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  210.161869ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:48 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:48 | 200 |  174.971625ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  176.472332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  230.807281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  134.063488ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:49 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:49 | 200 |  174.437775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |  1.557817623s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |   95.108878ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:51 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:51 | 200 |  135.748623ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:52 | 200 |  1.093852194s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:52 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:52 | 200 |   94.012566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  176.254603ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  424.294465ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |   94.378392ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:53 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:53 | 200 |  175.750028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:54 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:28:54 | 200 |  1.052018571s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:28:54 launchpad ollama[1754]: time=2025-04-13T15:28:54.889-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.034-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.701-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.701-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 46311"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:28:55 launchpad ollama[1754]: time=2025-04-13T15:28:55.702-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] build info | build=0 commit="unknown" tid="139702568947712" timestamp=1744583335
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139702568947712" timestamp=1744583335 total_threads=16
+Apr 13 15:28:55 launchpad ollama[155704]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46311" tid="139702568947712" timestamp=1744583335
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:28:55 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:28:55 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:28:55 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: time=2025-04-13T15:28:56.001-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:28:56 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:28:56 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:28:56 launchpad ollama[155704]: INFO [main] model loaded | tid="139702568947712" timestamp=1744583336
+Apr 13 15:28:57 launchpad ollama[1754]: time=2025-04-13T15:28:57.005-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:29:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:29:10 | 200 | 15.528294021s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:30:47 launchpad ollama[1754]: time=2025-04-13T15:30:47.444-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:30:59 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:30:59 | 200 | 12.251677743s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:30:59 launchpad ollama[1754]: time=2025-04-13T15:30:59.866-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="645.2 MiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9517268992 required="6.2 GiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.562-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44167"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.564-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] build info | build=0 commit="unknown" tid="140716815843328" timestamp=1744583460
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140716815843328" timestamp=1744583460 total_threads=16
+Apr 13 15:31:00 launchpad ollama[162171]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44167" tid="140716815843328" timestamp=1744583460
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:31:00 launchpad ollama[1754]: time=2025-04-13T15:31:00.815-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_head           = 32
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model type       = 8B
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_print_meta: max token length = 256
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:31:00 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:31:00 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:31:00 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:31:01 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:31:01 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:31:01 launchpad ollama[162171]: INFO [main] model loaded | tid="140716815843328" timestamp=1744583461
+Apr 13 15:31:01 launchpad ollama[1754]: time=2025-04-13T15:31:01.818-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:31:01 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:01 | 200 |     2.295382s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:02 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:02 | 200 |   172.82425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |  1.023615991s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |   92.011696ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:03 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:03 | 200 |  172.003511ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |   1.01405703s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |    93.34209ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:04 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:04 | 200 |  172.470598ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |  827.630636ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |   90.832012ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:05 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:05 | 200 |  171.703666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:06 | 200 |  977.696344ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:06 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:06 | 200 |   90.990238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |  171.661044ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |  701.324264ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:07 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:07 | 200 |   90.880075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |  171.233842ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |   725.87934ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:08 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:08 | 200 |   90.319721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  171.049921ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  510.984656ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:09 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:09 | 200 |  132.815278ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  171.691165ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |   293.35695ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  174.095509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  172.078875ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:10 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:10 | 200 |  200.772509ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  172.798378ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  171.343274ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  435.532499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  131.438092ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:11 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:11 | 200 |  175.380635ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |  615.994352ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |   91.321108ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:12 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:12 | 200 |  171.567546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  493.008798ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  132.712482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  130.858442ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:13 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:13 | 200 |  178.975808ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  171.302138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  171.519109ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |  676.712644ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:14 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:14 | 200 |   89.402666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  171.906299ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  207.864117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  172.189789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:15 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:15 | 200 |  171.544534ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |  735.047485ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |  130.804605ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:16 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:16 | 200 |   171.68224ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  292.339542ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  171.780638ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  171.663376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  265.501974ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:17 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:17 | 200 |  174.116283ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  172.011446ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  209.780381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  172.116588ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  171.302487ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:18 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:18 | 200 |  228.867021ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:19 | 200 |  130.427285ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:19 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:19 | 200 |  172.539627ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |   939.93145ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |   90.696951ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:20 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:20 | 200 |  172.931688ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |  1.002579344s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |   91.129161ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:21 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:21 | 200 |  171.674444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |  460.472677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |     90.9708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:22 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:22 | 200 |  172.491782ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:23 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:23 | 200 |  1.015437557s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:31:23 launchpad ollama[1754]: time=2025-04-13T15:31:23.688-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:31:23 launchpad ollama[1754]: time=2025-04-13T15:31:23.836-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.1 GiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.506-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.3 GiB" free_swap="68.9 GiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.506-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=38 layers.split="" memory.available="[8.9 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.7 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.7 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama1506917716/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 38 --parallel 1 --port 33047"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.507-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] build info | build=0 commit="unknown" tid="139816642605056" timestamp=1744583484
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139816642605056" timestamp=1744583484 total_threads=16
+Apr 13 15:31:24 launchpad ollama[162284]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33047" tid="139816642605056" timestamp=1744583484
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: arch             = llama
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_head           = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope type        = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model type       = 13B
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_print_meta: max token length = 48
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:31:24 launchpad ollama[1754]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:31:24 launchpad ollama[1754]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:31:24 launchpad ollama[1754]: time=2025-04-13T15:31:24.806-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: offloading 38 repeating layers to GPU
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors: offloaded 38/41 layers to GPU
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 13 15:31:24 launchpad ollama[1754]: llm_load_tensors:      CUDA0 buffer size =  6467.42 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:31:25 launchpad ollama[1754]: llama_kv_cache_init:  CUDA_Host KV buffer size =    80.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_kv_cache_init:      CUDA0 KV buffer size =  1520.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:31:25 launchpad ollama[1754]: llama_new_context_with_model: graph splits = 26
+Apr 13 15:31:25 launchpad ollama[162284]: INFO [main] model loaded | tid="139816642605056" timestamp=1744583485
+Apr 13 15:31:25 launchpad ollama[1754]: time=2025-04-13T15:31:25.809-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:31:39 launchpad ollama[1754]: [GIN] 2025/04/13 - 15:31:39 | 200 | 16.091503946s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:37:14 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 15:37:14 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 15:37:14 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 15:37:14 launchpad systemd[1]: ollama.service: Consumed 56min 8.863s CPU time, 12.6G memory peak, 11.4G read from disk, 508.1M written to disk, 19M incoming IP traffic, 30.2M outgoing IP traffic.
+-- Boot a78f12391ecd4729a571ed50e2e04cee --
+Apr 13 15:37:56 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 15:37:56 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 15:37:56 launchpad ollama[1756]: 2025/04/13 15:37:56 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.480-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.490-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.491-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 15:37:56 launchpad ollama[1756]: time=2025-04-13T15:37:56.492-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3624026559/runners
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.472-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.473-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.473-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:37:59 launchpad ollama[1756]: time=2025-04-13T15:37:59.713-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 15:39:10 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:10 | 200 |    3.758748ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 15:39:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:11 | 200 |     543.705µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 15:39:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:11 | 200 |     767.313µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 15:39:18 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:39:18 | 200 |      36.682µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10312744960 required="6.2 GiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.4 GiB" free_swap="68.9 GiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.911-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38011"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.913-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:41:58 launchpad ollama[1756]: time=2025-04-13T15:41:58.914-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] build info | build=0 commit="unknown" tid="140634880040960" timestamp=1744584119
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140634880040960" timestamp=1744584119 total_threads=16
+Apr 13 15:41:59 launchpad ollama[5628]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38011" tid="140634880040960" timestamp=1744584119
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:41:59 launchpad ollama[1756]: time=2025-04-13T15:41:59.165-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_head           = 32
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model type       = 8B
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_print_meta: max token length = 256
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:41:59 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:41:59 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:41:59 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:42:04 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:42:05 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:42:05 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:42:05 launchpad ollama[5628]: INFO [main] model loaded | tid="140634880040960" timestamp=1744584125
+Apr 13 15:42:05 launchpad ollama[1756]: time=2025-04-13T15:42:05.435-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.52 seconds"
+Apr 13 15:42:05 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:05 | 200 |  6.885074904s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:11 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:11 | 200 |  195.666683ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:12 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:12 | 200 |  1.342602523s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:42:12 launchpad ollama[1756]: time=2025-04-13T15:42:12.609-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:12 launchpad ollama[1756]: time=2025-04-13T15:42:12.754-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10312744960 required="9.2 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="58.0 GiB" free_swap="68.9 GiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.454-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 38247"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.455-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.456-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] build info | build=0 commit="unknown" tid="140651021824000" timestamp=1744584133
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140651021824000" timestamp=1744584133 total_threads=16
+Apr 13 15:42:13 launchpad ollama[5741]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38247" tid="140651021824000" timestamp=1744584133
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_head           = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model type       = 13B
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_print_meta: max token length = 48
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:42:13 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:42:13 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:42:13 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:42:13 launchpad ollama[1756]: time=2025-04-13T15:42:13.706-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 13 15:42:21 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:42:22 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:42:22 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:42:22 launchpad ollama[5741]: INFO [main] model loaded | tid="140651021824000" timestamp=1744584142
+Apr 13 15:42:22 launchpad ollama[1756]: time=2025-04-13T15:42:22.231-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.78 seconds"
+Apr 13 15:42:37 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:37 | 200 | 24.657033543s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:42:37 launchpad ollama[1756]: time=2025-04-13T15:42:37.331-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:38 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:38 | 200 |  1.426392962s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:42:38 launchpad ollama[1756]: time=2025-04-13T15:42:38.799-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:42:43 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:42:43 | 200 |  5.189497678s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:45:50 launchpad ollama[1756]: time=2025-04-13T15:45:50.385-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="905.0 MiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.068-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=10316021760 required="6.2 GiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.068-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.069-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.069-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45013"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.070-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] build info | build=0 commit="unknown" tid="139679094628352" timestamp=1744584351
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139679094628352" timestamp=1744584351 total_threads=16
+Apr 13 15:45:51 launchpad ollama[6495]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45013" tid="139679094628352" timestamp=1744584351
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type  f32:   65 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type q4_0:  225 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:45:51 launchpad ollama[1756]: time=2025-04-13T15:45:51.321-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 256
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: vocab type       = BPE
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 128256
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 280147
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 4096
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_head           = 32
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 8
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 4
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 14336
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model type       = 8B
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model params     = 8.03 B
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_print_meta: max token length = 256
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:45:51 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:45:51 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 13 15:45:51 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 8192
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:45:52 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1030
+Apr 13 15:45:52 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:45:52 launchpad ollama[6495]: INFO [main] model loaded | tid="139679094628352" timestamp=1744584352
+Apr 13 15:45:52 launchpad ollama[1756]: time=2025-04-13T15:45:52.324-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 13 15:45:52 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:52 | 200 |   2.29403766s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:52 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:52 | 200 |  175.115602ms |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:54 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:45:54 | 200 |  1.481339439s |       127.0.0.1 | POST     "/api/embed"
+Apr 13 15:45:54 launchpad ollama[1756]: time=2025-04-13T15:45:54.202-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 13 15:45:54 launchpad ollama[1756]: time=2025-04-13T15:45:54.343-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="3.9 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.041-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=10316021760 required="9.2 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.042-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="57.8 GiB" free_swap="68.9 GiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.042-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[9.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="9.2 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[9.2 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama3624026559/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 41 --parallel 1 --port 41581"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.043-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.044-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] build info | build=0 commit="unknown" tid="140406867972096" timestamp=1744584355
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140406867972096" timestamp=1744584355 total_threads=16
+Apr 13 15:45:55 launchpad ollama[6524]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41581" tid="140406867972096" timestamp=1744584355
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type  f32:   81 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type q4_0:  281 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llama_model_loader: - type q6_K:    1 tensors
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_vocab: special tokens cache size = 3
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: format           = GGUF V2
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: arch             = llama
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: vocab type       = SPM
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_vocab          = 32016
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_merges         = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: vocab_only       = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd           = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_layer          = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_head           = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_head_kv        = 40
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_rot            = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_swa            = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_gqa            = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ff             = 13824
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_expert         = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_expert_used    = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: causal attn      = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: pooling type     = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope type        = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope scaling     = linear
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: freq_scale_train = 1
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_d_state      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model type       = 13B
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model ftype      = Q4_0
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model params     = 13.02 B
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: general.name     = codellama
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: BOS token        = 1 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: EOS token        = 2 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: UNK token        = 0 ''
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_print_meta: max token length = 48
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 13 15:45:55 launchpad ollama[1756]: ggml_cuda_init: found 1 CUDA devices:
+Apr 13 15:45:55 launchpad ollama[1756]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 13 15:45:55 launchpad ollama[1756]: time=2025-04-13T15:45:55.341-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloading 40 repeating layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors: offloaded 41/41 layers to GPU
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors:        CPU buffer size =    87.93 MiB
+Apr 13 15:45:55 launchpad ollama[1756]: llm_load_tensors:      CUDA0 buffer size =  6936.07 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_ctx      = 2048
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_batch    = 512
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: n_ubatch   = 512
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: flash_attn = 0
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: freq_scale = 1
+Apr 13 15:45:56 launchpad ollama[1756]: llama_kv_cache_init:      CUDA0 KV buffer size =  1600.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:      CUDA0 compute buffer size =   204.00 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model:  CUDA_Host compute buffer size =    14.01 MiB
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: graph nodes  = 1286
+Apr 13 15:45:56 launchpad ollama[1756]: llama_new_context_with_model: graph splits = 2
+Apr 13 15:45:56 launchpad ollama[6524]: INFO [main] model loaded | tid="140406867972096" timestamp=1744584356
+Apr 13 15:45:56 launchpad ollama[1756]: time=2025-04-13T15:45:56.344-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 13 15:46:01 launchpad ollama[1756]: [GIN] 2025/04/13 - 15:46:01 | 200 |  7.408036594s |       127.0.0.1 | POST     "/api/chat"
+Apr 13 15:55:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 15:55:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 15:55:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 15:55:41 launchpad systemd[1]: ollama.service: Consumed 41.005s CPU time, 12.4G memory peak, 11.4G read from disk, 508.1M written to disk.
+-- Boot 20128fdcbaa2418e9d2aceed164d3c06 --
+Apr 13 15:56:22 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 15:56:22 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 15:56:22 launchpad ollama[1760]: 2025/04/13 15:56:22 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.165-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.175-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.176-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 15:56:22 launchpad ollama[1760]: time=2025-04-13T15:56:22.177-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1080162874/runners
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.347-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.348-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.348-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 15:56:25 launchpad ollama[1760]: time=2025-04-13T15:56:25.567-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |    3.583045ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |     658.458µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:00:34 launchpad ollama[1760]: [GIN] 2025/04/13 - 16:00:34 | 200 |     723.653µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 16:16:01 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:16:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:16:01 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:16:01 launchpad systemd[1]: ollama.service: Consumed 3.644s CPU time, 786.5M memory peak, 234.4M read from disk, 508.1M written to disk, 1.3K incoming IP traffic, 6.2K outgoing IP traffic.
+-- Boot b8bbb9ea00c44553bbb55609f6f047a8 --
+Apr 13 16:16:40 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:16:40 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:16:41 launchpad ollama[1764]: 2025/04/13 16:16:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.045-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.055-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.056-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:16:41 launchpad ollama[1764]: time=2025-04-13T16:16:41.057-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama750748247/runners
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.027-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.028-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.028-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.029-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.029-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:16:44 launchpad ollama[1764]: time=2025-04-13T16:16:44.261-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:30:08 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:08 | 200 |    3.547667ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:30:10 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:10 | 200 |      688.84µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 16:30:10 launchpad ollama[1764]: [GIN] 2025/04/13 - 16:30:10 | 200 |     748.102µs |       127.0.0.1 | GET      "/api/version"
+Apr 13 16:51:31 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:51:31 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:51:31 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:51:31 launchpad systemd[1]: ollama.service: Consumed 3.450s CPU time, 786.7M memory peak, 234.4M read from disk, 508.1M written to disk, 1.3K incoming IP traffic, 6.2K outgoing IP traffic.
+-- Boot dee112a04b9e4f0a8d0dead2f8cfb4fc --
+Apr 13 16:52:12 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:52:12 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:52:12 launchpad ollama[1762]: 2025/04/13 16:52:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.182-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.193-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.194-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:52:12 launchpad ollama[1762]: time=2025-04-13T16:52:12.196-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1951053274/runners
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.176-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.178-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:52:15 launchpad ollama[1762]: time=2025-04-13T16:52:15.405-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 16:54:47 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 16:54:47 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 16:54:47 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 16:54:47 launchpad systemd[1]: ollama.service: Consumed 3.423s CPU time, 787.1M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 43ecbe2f41fb4b8da3b592399ebf60e2 --
+Apr 13 16:55:26 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 16:55:27 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 16:55:27 launchpad ollama[1758]: 2025/04/13 16:55:27 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.146-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.155-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.156-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 16:55:27 launchpad ollama[1758]: time=2025-04-13T16:55:27.158-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama841105555/runners
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.129-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.130-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.130-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 16:55:30 launchpad ollama[1758]: time=2025-04-13T16:55:30.358-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 17:28:51 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 17:28:51 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 17:28:51 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 17:28:51 launchpad systemd[1]: ollama.service: Consumed 3.439s CPU time, 786.8M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 5268246c82d140dd8cf2d9bfdbe01090 --
+Apr 13 17:29:45 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 17:29:45 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 17:29:45 launchpad ollama[1771]: 2025/04/13 17:29:45 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.525-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.536-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.537-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 17:29:45 launchpad ollama[1771]: time=2025-04-13T17:29:45.539-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama781222968/runners
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.465-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:48 launchpad ollama[1771]: time=2025-04-13T17:29:48.467-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:29:50 launchpad ollama[1771]: time=2025-04-13T17:29:50.235-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 13 17:30:26 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 13 17:30:26 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 13 17:30:26 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 13 17:30:26 launchpad systemd[1]: ollama.service: Consumed 5.191s CPU time, 789.7M memory peak, 237.4M read from disk, 508.1M written to disk.
+-- Boot e268674044bb476e80f6a4fbb0f4cc1b --
+Apr 13 17:31:07 launchpad systemd[1]: Starting Server for local large language models...
+Apr 13 17:31:07 launchpad systemd[1]: Started Server for local large language models.
+Apr 13 17:31:07 launchpad ollama[1756]: 2025/04/13 17:31:07 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.517-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.526-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.527-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 13 17:31:07 launchpad ollama[1756]: time=2025-04-13T17:31:07.529-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama621053729/runners
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.579-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.580-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.581-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 13 17:31:10 launchpad ollama[1756]: time=2025-04-13T17:31:10.812-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 13 18:03:15 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:15 | 200 |    3.668785ms |       127.0.0.1 | GET      "/api/tags"
+Apr 13 18:03:16 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:16 | 200 |     598.154µs |       127.0.0.1 | GET      "/api/tags"
+Apr 13 18:03:16 launchpad ollama[1756]: [GIN] 2025/04/13 - 18:03:16 | 200 |     611.121µs |       127.0.0.1 | GET      "/api/version"
+Apr 15 14:20:41 launchpad ollama[1756]: [GIN] 2025/04/15 - 14:20:41 | 200 |     687.345µs |       127.0.0.1 | GET      "/api/tags"
+Apr 15 14:20:41 launchpad ollama[1756]: [GIN] 2025/04/15 - 14:20:41 | 200 |      49.147µs |       127.0.0.1 | GET      "/api/version"
+Apr 15 15:36:17 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:36:17 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:36:17 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:36:17 launchpad systemd[1]: ollama.service: Consumed 4.198s CPU time, 786.7M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 431c14e51fac4578a0e4f6280b7e5e30 --
+Apr 15 15:37:06 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:37:06 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:37:06 launchpad ollama[1752]: 2025/04/15 15:37:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.397-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.408-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.409-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:37:06 launchpad ollama[1752]: time=2025-04-15T15:37:06.411-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3612536661/runners
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.397-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.397-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:09 launchpad ollama[1752]: time=2025-04-15T15:37:09.398-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:37:17 launchpad ollama[1752]: time=2025-04-15T15:37:17.499-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 15:37:38 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:37:39 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:37:39 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:37:39 launchpad systemd[1]: ollama.service: Consumed 11.360s CPU time, 786.4M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 25edadc520564711bb6293d77d6db8fe --
+Apr 15 15:38:19 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:38:19 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:38:19 launchpad ollama[1756]: 2025/04/15 15:38:19 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.551-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.559-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.561-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:38:19 launchpad ollama[1756]: time=2025-04-15T15:38:19.562-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama251922655/runners
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.543-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.543-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.544-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:38:22 launchpad ollama[1756]: time=2025-04-15T15:38:22.775-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 15:56:27 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 15:56:28 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 15:56:28 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 15:56:28 launchpad systemd[1]: ollama.service: Consumed 3.445s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot f683eeafa22e40de8bb52c8456b74fc9 --
+Apr 15 15:58:28 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 15:58:28 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 15:58:28 launchpad ollama[1758]: 2025/04/15 15:58:28 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.980-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.987-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.988-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 15:58:28 launchpad ollama[1758]: time=2025-04-15T15:58:28.989-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1509245173/runners
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.996-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:31 launchpad ollama[1758]: time=2025-04-15T15:58:31.998-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 15:58:32 launchpad ollama[1758]: time=2025-04-15T15:58:32.224-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 16:20:25 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 16:20:25 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 16:20:25 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 16:20:25 launchpad systemd[1]: ollama.service: Consumed 3.464s CPU time, 786.9M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot e33af310e153423592a7dc42621bfb6b --
+Apr 15 16:21:05 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 16:21:05 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 16:21:05 launchpad ollama[1751]: 2025/04/15 16:21:05 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.756-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.764-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.764-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 16:21:05 launchpad ollama[1751]: time=2025-04-15T16:21:05.766-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama134506118/runners
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.748-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.749-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.749-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.750-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.750-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:21:08 launchpad ollama[1751]: time=2025-04-15T16:21:08.959-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 16:41:30 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 16:41:30 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 16:41:30 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 16:41:30 launchpad systemd[1]: ollama.service: Consumed 3.412s CPU time, 787.8M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot ff2aa01d05b342759a68d202eb1e0194 --
+Apr 15 16:42:11 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 16:42:11 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 16:42:11 launchpad ollama[1765]: 2025/04/15 16:42:11 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.520-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.528-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.530-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 16:42:11 launchpad ollama[1765]: time=2025-04-15T16:42:11.532-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3944000283/runners
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.569-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.570-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 16:42:14 launchpad ollama[1765]: time=2025-04-15T16:42:14.788-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:11:55 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:11:56 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:11:56 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:11:56 launchpad systemd[1]: ollama.service: Consumed 3.484s CPU time, 787M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 4d2c224ef067436aaa53e578ed6a3c23 --
+Apr 15 17:12:35 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:12:35 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:12:35 launchpad ollama[1756]: 2025/04/15 17:12:35 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.952-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.960-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.961-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:12:35 launchpad ollama[1756]: time=2025-04-15T17:12:35.964-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama719220119/runners
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.950-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:38 launchpad ollama[1756]: time=2025-04-15T17:12:38.951-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:12:39 launchpad ollama[1756]: time=2025-04-15T17:12:39.171-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:29:03 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:29:03 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:29:03 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:29:03 launchpad systemd[1]: ollama.service: Consumed 3.425s CPU time, 786.6M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 92afaf03a1c241d4b4e10462241fd95d --
+Apr 15 17:29:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:29:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:29:44 launchpad ollama[1753]: 2025/04/15 17:29:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.267-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.275-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.276-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:29:44 launchpad ollama[1753]: time=2025-04-15T17:29:44.278-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama108880241/runners
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.320-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.321-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:29:47 launchpad ollama[1753]: time=2025-04-15T17:29:47.546-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 17:36:30 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 17:36:30 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 17:36:30 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 17:36:30 launchpad systemd[1]: ollama.service: Consumed 3.480s CPU time, 787.2M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot d88d7d2304fb4d7b86d2ef0adda036df --
+Apr 15 17:37:12 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 17:37:12 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 17:37:12 launchpad ollama[1755]: 2025/04/15 17:37:12 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.496-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.505-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.506-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 17:37:12 launchpad ollama[1755]: time=2025-04-15T17:37:12.507-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2772590638/runners
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.564-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.564-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 17:37:15 launchpad ollama[1755]: time=2025-04-15T17:37:15.780-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 18:42:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 18:42:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 18:42:18 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 18:42:18 launchpad systemd[1]: ollama.service: Consumed 3.517s CPU time, 787.6M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot adf2dd4a7a96414d875044bff7cddcef --
+Apr 15 18:42:59 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 18:42:59 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 18:42:59 launchpad ollama[1766]: 2025/04/15 18:42:59 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.304-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.312-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.313-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 18:42:59 launchpad ollama[1766]: time=2025-04-15T18:42:59.315-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3858422155/runners
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.348-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.349-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 18:43:02 launchpad ollama[1766]: time=2025-04-15T18:43:02.568-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:05:37 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:05:37 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:05:38 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:05:38 launchpad systemd[1]: ollama.service: Consumed 3.479s CPU time, 787M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 56deafc783e24c26bad044f3835079cc --
+Apr 15 19:06:17 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:06:17 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:06:18 launchpad ollama[1751]: 2025/04/15 19:06:18 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.103-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.110-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.111-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:06:18 launchpad ollama[1751]: time=2025-04-15T19:06:18.112-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1087723250/runners
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.155-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.156-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.156-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.157-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.157-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:06:21 launchpad ollama[1751]: time=2025-04-15T19:06:21.384-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:34:06 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:34:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:34:06 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:34:06 launchpad systemd[1]: ollama.service: Consumed 3.491s CPU time, 786.7M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 8b2c7e99e0864c20b11a5e72897b1d52 --
+Apr 15 19:34:52 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:34:52 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:34:53 launchpad ollama[1766]: 2025/04/15 19:34:53 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.066-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.075-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.076-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:34:53 launchpad ollama[1766]: time=2025-04-15T19:34:53.078-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama955304025/runners
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.057-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:56 launchpad ollama[1766]: time=2025-04-15T19:34:56.058-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:34:57 launchpad ollama[1766]: time=2025-04-15T19:34:57.812-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 19:35:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:35:18 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:35:18 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:35:18 launchpad systemd[1]: ollama.service: Consumed 5.278s CPU time, 787M memory peak, 234.2M read from disk, 508.1M written to disk.
+-- Boot 8c1bec5b01f44177a52bc758ca6ad5c4 --
+Apr 15 19:35:58 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:35:58 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:35:58 launchpad ollama[1756]: 2025/04/15 19:35:58 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.593-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.602-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.603-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:35:58 launchpad ollama[1756]: time=2025-04-15T19:35:58.605-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama326345868/runners
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.642-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.643-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:36:01 launchpad ollama[1756]: time=2025-04-15T19:36:01.879-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:41:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:41:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:41:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:41:41 launchpad systemd[1]: ollama.service: Consumed 3.472s CPU time, 787.4M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 723cc3307f3b4b60b2a7d48a856c4333 --
+Apr 15 19:42:36 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:42:36 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:42:36 launchpad ollama[1766]: 2025/04/15 19:42:36 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.558-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.566-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.567-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:42:36 launchpad ollama[1766]: time=2025-04-15T19:42:36.569-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama135249836/runners
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.498-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.499-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.499-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.500-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:39 launchpad ollama[1766]: time=2025-04-15T19:42:39.500-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:42:41 launchpad ollama[1766]: time=2025-04-15T19:42:41.372-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 15 19:43:01 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:43:01 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:43:01 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:43:01 launchpad systemd[1]: ollama.service: Consumed 5.316s CPU time, 790.7M memory peak, 237.6M read from disk, 508.1M written to disk.
+-- Boot 046f08952da54b2a93bdfe876ed593e3 --
+Apr 15 19:43:43 launchpad systemd[1]: Starting Server for local large language models...
+Apr 15 19:43:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 15 19:43:44 launchpad ollama[1763]: 2025/04/15 19:43:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.136-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.145-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.146-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 15 19:43:44 launchpad ollama[1763]: time=2025-04-15T19:43:44.147-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2763501498/runners
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.184-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.184-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.185-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 15 19:43:47 launchpad ollama[1763]: time=2025-04-15T19:43:47.413-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 15 19:49:05 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 15 19:49:05 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 15 19:49:05 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 15 19:49:05 launchpad systemd[1]: ollama.service: Consumed 3.476s CPU time, 787.7M memory peak, 234.3M read from disk, 508.1M written to disk.
+-- Boot 7ec7b52398314a5089d714a76ff8aafd --
+Apr 16 08:49:25 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 08:49:26 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 08:49:26 launchpad ollama[1767]: 2025/04/16 08:49:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.154-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.161-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.162-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 08:49:26 launchpad ollama[1767]: time=2025-04-16T08:49:26.163-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1336750794/runners
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.130-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.131-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 08:49:29 launchpad ollama[1767]: time=2025-04-16T08:49:29.356-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 09:52:34 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 09:52:34 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 09:52:34 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 09:52:34 launchpad systemd[1]: ollama.service: Consumed 3.443s CPU time, 786.7M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 67348c505ba8474fac5f64b2b50ede37 --
+Apr 16 09:53:08 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 09:53:08 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 09:53:08 launchpad ollama[1758]: 2025/04/16 09:53:08 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.788-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.796-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.797-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 09:53:08 launchpad ollama[1758]: time=2025-04-16T09:53:08.798-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3764926553/runners
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.773-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.774-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.774-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.775-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.775-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:53:11 launchpad ollama[1758]: time=2025-04-16T09:53:11.991-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 09:58:32 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 09:58:32 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 09:58:32 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 09:58:32 launchpad systemd[1]: ollama.service: Consumed 3.403s CPU time, 787.2M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 22011e926bda407082bdad0deef3251d --
+Apr 16 09:59:04 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 09:59:04 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 09:59:04 launchpad ollama[1758]: 2025/04/16 09:59:04 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.576-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.586-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.586-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 09:59:04 launchpad ollama[1758]: time=2025-04-16T09:59:04.587-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3002500503/runners
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.616-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.617-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.617-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.618-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.618-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 09:59:07 launchpad ollama[1758]: time=2025-04-16T09:59:07.833-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:04:42 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:04:42 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:04:42 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:04:42 launchpad systemd[1]: ollama.service: Consumed 3.454s CPU time, 787.2M memory peak, 234.5M read from disk, 508.1M written to disk.
+-- Boot 1cdc3f1f0ea5475cae0edd2be2a5d86b --
+Apr 16 10:05:14 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:05:14 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:05:14 launchpad ollama[1763]: 2025/04/16 10:05:14 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.681-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.691-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.692-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:05:14 launchpad ollama[1763]: time=2025-04-16T10:05:14.693-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2353227385/runners
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.705-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.705-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.706-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:05:17 launchpad ollama[1763]: time=2025-04-16T10:05:17.920-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:16:57 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:16:57 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:16:57 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:16:57 launchpad systemd[1]: ollama.service: Consumed 3.459s CPU time, 786.6M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot e43d69d9549746058c4c3cef5f453029 --
+Apr 16 10:17:29 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:17:29 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:17:29 launchpad ollama[1751]: 2025/04/16 10:17:29 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.306-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.315-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.316-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:17:29 launchpad ollama[1751]: time=2025-04-16T10:17:29.317-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1196898257/runners
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.352-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.352-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.353-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:17:32 launchpad ollama[1751]: time=2025-04-16T10:17:32.566-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:28:53 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:28:54 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:28:54 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:28:54 launchpad systemd[1]: ollama.service: Consumed 3.429s CPU time, 786.8M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot c129e31b363247f9b053a7c6135bcb92 --
+Apr 16 10:29:25 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:29:25 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:29:26 launchpad ollama[1756]: 2025/04/16 10:29:26 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.108-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.116-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.117-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:29:26 launchpad ollama[1756]: time=2025-04-16T10:29:26.118-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1140884484/runners
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.150-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx2 cuda_v12 cpu cpu_avx]"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.152-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:29:29 launchpad ollama[1756]: time=2025-04-16T10:29:29.373-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 10:49:41 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 10:49:41 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 10:49:41 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 10:49:41 launchpad systemd[1]: ollama.service: Consumed 3.488s CPU time, 786.7M memory peak, 233.9M read from disk, 508.1M written to disk.
+-- Boot b7737e9169e547b3ba5bad348f5ca7cf --
+Apr 16 10:50:13 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 10:50:13 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 10:50:13 launchpad ollama[1541]: 2025/04/16 10:50:13 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.381-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.393-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.393-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 10:50:13 launchpad ollama[1541]: time=2025-04-16T10:50:13.396-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2767395511/runners
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.432-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.433-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.434-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 10:50:16 launchpad ollama[1541]: time=2025-04-16T10:50:16.665-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 16 12:09:45 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 16 12:09:45 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 16 12:09:45 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 16 12:09:45 launchpad systemd[1]: ollama.service: Consumed 3.522s CPU time, 787M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot d5d86d060a234fe6a240e2a1ffc43b8e --
+Apr 16 12:10:20 launchpad systemd[1]: Starting Server for local large language models...
+Apr 16 12:10:20 launchpad systemd[1]: Started Server for local large language models.
+Apr 16 12:10:20 launchpad ollama[1547]: 2025/04/16 12:10:20 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.428-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.440-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.441-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 16 12:10:20 launchpad ollama[1547]: time=2025-04-16T12:10:20.443-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1129470658/runners
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.564-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cuda_v12 cpu cpu_avx cpu_avx2]"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.565-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.565-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.566-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.566-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 16 12:10:23 launchpad ollama[1547]: time=2025-04-16T12:10:23.801-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:02:06 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:02:06 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:02:06 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:02:06 launchpad systemd[1]: ollama.service: Consumed 3.847s CPU time, 787.4M memory peak, 233.7M read from disk, 508.1M written to disk.
+-- Boot da5d640ed2764ad2ab3162ee265faccc --
+Apr 17 08:02:39 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:02:39 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:02:39 launchpad ollama[1558]: 2025/04/17 08:02:39 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.459-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.468-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.469-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:02:39 launchpad ollama[1558]: time=2025-04-17T08:02:39.471-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3039898910/runners
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.503-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.503-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.504-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:02:42 launchpad ollama[1558]: time=2025-04-17T08:02:42.720-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:04:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:04:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:04:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:04:19 launchpad systemd[1]: ollama.service: Consumed 3.456s CPU time, 787.4M memory peak, 234.1M read from disk, 508.1M written to disk.
+-- Boot 8f7476c1ba934d1db993b63d96926b29 --
+Apr 17 08:04:55 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:04:55 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:04:55 launchpad ollama[1554]: 2025/04/17 08:04:55 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.299-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.310-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.311-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:04:55 launchpad ollama[1554]: time=2025-04-17T08:04:55.313-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama2590820481/runners
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.342-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.343-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.343-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:04:58 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:04:58 launchpad ollama[1554]: time=2025-04-17T08:04:58.850-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.4 GiB"
+Apr 17 08:04:59 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:04:59 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:04:59 launchpad systemd[1]: ollama.service: Consumed 3.448s CPU time, 767.6M memory peak, 234.3M read from disk, 507.3M written to disk.
+-- Boot 758f89d306924c14ac8df7085e6f4c8c --
+Apr 17 08:05:41 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:05:41 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:05:41 launchpad ollama[1547]: 2025/04/17 08:05:41 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.421-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.431-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.432-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:05:41 launchpad ollama[1547]: time=2025-04-17T08:05:41.433-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama3042380030/runners
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.420-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.421-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.421-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:05:44 launchpad ollama[1547]: time=2025-04-17T08:05:44.650-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.8 GiB"
+Apr 17 08:18:18 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:18:19 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:18:19 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:18:19 launchpad systemd[1]: ollama.service: Consumed 3.448s CPU time, 787.4M memory peak, 234.6M read from disk, 508.1M written to disk.
+-- Boot a7504e0bc21940739f8f68c60693dfbc --
+Apr 17 08:18:57 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:18:57 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:18:57 launchpad ollama[1563]: 2025/04/17 08:18:57 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.484-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.493-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.495-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:18:57 launchpad ollama[1563]: time=2025-04-17T08:18:57.495-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1624591756/runners
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu_avx cpu_avx2 cuda_v12 cpu]"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.473-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:00 launchpad ollama[1563]: time=2025-04-17T08:19:00.474-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:02 launchpad ollama[1563]: time=2025-04-17T08:19:02.187-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
+Apr 17 08:19:12 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:19:12 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:19:12 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:19:12 launchpad systemd[1]: ollama.service: Consumed 5.391s CPU time, 787.3M memory peak, 234M read from disk, 508.1M written to disk.
+-- Boot 2d85acb4569a4560b5355bd9c65e285d --
+Apr 17 08:19:44 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:19:44 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:19:44 launchpad ollama[1550]: 2025/04/17 08:19:44 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.390-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.398-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.399-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:19:44 launchpad ollama[1550]: time=2025-04-17T08:19:44.400-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama1046240382/runners
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.441-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.442-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.442-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.443-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:19:47 launchpad ollama[1550]: time=2025-04-17T08:19:47.665-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 08:33:33 launchpad systemd[1]: Stopping Server for local large language models...
+Apr 17 08:33:33 launchpad systemd[1]: ollama.service: Deactivated successfully.
+Apr 17 08:33:33 launchpad systemd[1]: Stopped Server for local large language models.
+Apr 17 08:33:33 launchpad systemd[1]: ollama.service: Consumed 3.482s CPU time, 787.4M memory peak, 233.8M read from disk, 508.1M written to disk.
+-- Boot 587a9d72538846239e9329ff2ac5527d --
+Apr 17 08:34:06 launchpad systemd[1]: Starting Server for local large language models...
+Apr 17 08:34:06 launchpad systemd[1]: Started Server for local large language models.
+Apr 17 08:34:06 launchpad ollama[1550]: 2025/04/17 08:34:06 routes.go:1153: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/var/lib/ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.375-07:00 level=INFO source=images.go:753 msg="total blobs: 33"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.385-07:00 level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.387-07:00 level=INFO source=routes.go:1200 msg="Listening on 127.0.0.1:11434 (version 0.3.12)"
+Apr 17 08:34:06 launchpad ollama[1550]: time=2025-04-17T08:34:06.388-07:00 level=INFO source=common.go:135 msg="extracting embedded files" dir=/tmp/ollama568753210/runners
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.423-07:00 level=INFO source=common.go:49 msg="Dynamic LLM libraries" runners="[cpu cpu_avx cpu_avx2 cuda_v12]"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.424-07:00 level=INFO source=gpu.go:199 msg="looking for compatible GPUs"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.424-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.425-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.425-07:00 level=WARN source=gpu.go:669 msg="unable to locate gpu dependency libraries"
+Apr 17 08:34:09 launchpad ollama[1550]: time=2025-04-17T08:34:09.638-07:00 level=INFO source=types.go:107 msg="inference compute" id=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda variant=v12 compute=8.6 driver=12.7 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="10.9 GiB"
+Apr 17 13:16:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:34 | 200 |    3.804253ms |       127.0.0.1 | GET      "/api/tags"
+Apr 17 13:16:35 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:35 | 200 |     601.213µs |       127.0.0.1 | GET      "/api/tags"
+Apr 17 13:16:35 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:35 | 200 |       37.78µs |       127.0.0.1 | GET      "/api/version"
+Apr 17 13:16:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:16:37 | 200 |      26.101µs |       127.0.0.1 | GET      "/api/version"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=1 available=8983150592 required="7.7 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.448-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="7.7 GiB" memory.required.partial="7.7 GiB" memory.required.kv="2.0 GiB" memory.required.allocations="[7.7 GiB]" memory.weights.total="5.7 GiB" memory.weights.repeating="5.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="1.1 GiB" memory.graph.partial="1.1 GiB"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.449-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 16384 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 35895"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.450-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] build info | build=0 commit="unknown" tid="139854789881856" timestamp=1744921041
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139854789881856" timestamp=1744921041 total_threads=16
+Apr 17 13:17:21 launchpad ollama[67599]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35895" tid="139854789881856" timestamp=1744921041
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:17:21 launchpad ollama[1550]: time=2025-04-17T13:17:21.701-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:17:21 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:17:21 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:17:21 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 16384
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:17:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  2048.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.50 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =  1088.00 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    40.01 MiB
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:17:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:17:27 launchpad ollama[67599]: INFO [main] model loaded | tid="139854789881856" timestamp=1744921047
+Apr 17 13:17:27 launchpad ollama[1550]: time=2025-04-17T13:17:27.719-07:00 level=INFO source=server.go:626 msg="llama runner started in 6.27 seconds"
+Apr 17 13:17:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:30 | 200 |  9.126188526s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:17:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:30 | 200 |  426.463896ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:17:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:17:32 | 200 |  1.669589102s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8999927808 required="6.2 GiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.617-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33245"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.619-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] build info | build=0 commit="unknown" tid="139715506327552" timestamp=1744921091
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139715506327552" timestamp=1744921091 total_threads=16
+Apr 17 13:18:11 launchpad ollama[67755]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33245" tid="139715506327552" timestamp=1744921091
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:18:11 launchpad ollama[1550]: time=2025-04-17T13:18:11.870-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:18:11 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:18:11 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:18:11 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:18:12 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:18:12 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:18:12 launchpad ollama[67755]: INFO [main] model loaded | tid="139715506327552" timestamp=1744921092
+Apr 17 13:18:12 launchpad ollama[1550]: time=2025-04-17T13:18:12.874-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 17 13:18:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:18:17 | 200 |   6.61916553s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:20:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:20:18 | 200 |  4.209518826s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.877-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9026928640 required="6.2 GiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.878-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.878-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39669"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:32:03 launchpad ollama[1550]: time=2025-04-17T13:32:03.879-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] build info | build=0 commit="unknown" tid="140277726531584" timestamp=1744921923
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140277726531584" timestamp=1744921923 total_threads=16
+Apr 17 13:32:03 launchpad ollama[69966]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39669" tid="140277726531584" timestamp=1744921923
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:32:03 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:32:04 launchpad ollama[1550]: time=2025-04-17T13:32:04.131-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:32:04 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:32:04 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:32:04 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:32:04 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:32:04 launchpad ollama[69966]: INFO [main] model loaded | tid="140277726531584" timestamp=1744921924
+Apr 17 13:32:05 launchpad ollama[1550]: time=2025-04-17T13:32:05.135-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 17 13:32:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:32:14 | 200 | 11.216938661s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9027059712 required="6.2 GiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.856-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45415"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.857-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:46:06 launchpad ollama[1550]: time=2025-04-17T13:46:06.858-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] build info | build=0 commit="unknown" tid="139958814859264" timestamp=1744922766
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139958814859264" timestamp=1744922766 total_threads=16
+Apr 17 13:46:06 launchpad ollama[72256]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45415" tid="139958814859264" timestamp=1744922766
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:46:06 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:46:07 launchpad ollama[1550]: time=2025-04-17T13:46:07.109-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:46:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:46:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:46:07 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:46:07 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:46:07 launchpad ollama[72256]: INFO [main] model loaded | tid="139958814859264" timestamp=1744922767
+Apr 17 13:46:08 launchpad ollama[1550]: time=2025-04-17T13:46:08.112-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:46:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:46:18 | 200 | 11.746749074s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.266-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9132834816 required="6.2 GiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.266-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.3 GiB" free_swap="68.9 GiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.267-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44429"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.268-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] build info | build=0 commit="unknown" tid="139760036306944" timestamp=1744923159
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139760036306944" timestamp=1744923159 total_threads=16
+Apr 17 13:52:39 launchpad ollama[73282]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44429" tid="139760036306944" timestamp=1744923159
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:52:39 launchpad ollama[1550]: time=2025-04-17T13:52:39.519-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:52:39 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:52:39 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:52:39 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:52:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:52:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:52:40 launchpad ollama[73282]: INFO [main] model loaded | tid="139760036306944" timestamp=1744923160
+Apr 17 13:52:40 launchpad ollama[1550]: time=2025-04-17T13:52:40.522-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:52:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:52:49 | 200 | 10.664116085s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.891-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8997306368 required="6.2 GiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.891-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.892-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42783"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 13:59:27 launchpad ollama[1550]: time=2025-04-17T13:59:27.893-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] build info | build=0 commit="unknown" tid="139996302262272" timestamp=1744923567
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139996302262272" timestamp=1744923567 total_threads=16
+Apr 17 13:59:27 launchpad ollama[74490]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42783" tid="139996302262272" timestamp=1744923567
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 13:59:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 13:59:28 launchpad ollama[1550]: time=2025-04-17T13:59:28.144-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 13:59:28 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 13:59:28 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 13:59:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 13:59:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 13:59:28 launchpad ollama[74490]: INFO [main] model loaded | tid="139996302262272" timestamp=1744923568
+Apr 17 13:59:29 launchpad ollama[1550]: time=2025-04-17T13:59:29.147-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 13:59:36 launchpad ollama[1550]: [GIN] 2025/04/17 - 13:59:36 | 200 |  8.726327132s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:00:16 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:16 | 200 |  4.472001558s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:00:21 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:21 | 200 |  4.418741149s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:00:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:00:24 | 200 |  2.612630528s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:11 | 200 |  1.208514073s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:02:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:12 | 200 |  177.531961ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:17 | 200 |  178.745028ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:18 | 200 |  835.093664ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:02:21 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:02:21 | 200 |  3.020288958s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:06:20 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:20 | 200 |  3.408414633s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:24 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1234 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744923984
+Apr 17 14:06:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:25 | 200 |  1.077575454s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:06:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:26 | 200 |  996.106633ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:27 | 200 |  996.704268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:28 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:28 | 200 |  938.107467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:29 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:29 | 200 |   915.24396ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:30 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:30 | 200 |  998.805303ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:31 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:31 | 200 |  1.384641956s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:06:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:06:38 | 200 |  6.897968821s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:07:44 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=5065 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924064
+Apr 17 14:07:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:50 | 200 |  6.262280302s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:07:50 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924070
+Apr 17 14:07:51 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:51 | 200 |  1.178453488s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:51 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924071
+Apr 17 14:07:52 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:52 | 200 |  1.176354245s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:53 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:53 | 200 |  1.012140902s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:53 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924073
+Apr 17 14:07:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:54 | 200 |  1.057069122s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:55 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924075
+Apr 17 14:07:56 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:56 | 200 |  1.141288272s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:07:57 | 200 |  1.506940093s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:07:57 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1392 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924077
+Apr 17 14:08:02 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:08:02 | 200 |  4.822690255s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:09:19 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1864 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924159
+Apr 17 14:09:22 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:22 | 200 |  3.301964076s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:09:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:23 | 200 |  177.759381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:23 | 200 |  175.241286ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |  938.075968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |   95.604798ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:24 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:24 | 200 |   175.74611ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:25 | 200 |  1.302571234s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:09:31 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:09:31 | 200 |  6.045540856s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:10:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:44 | 200 |  1.275281542s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:10:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:46 | 200 |  179.586836ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:46 | 200 |  182.940639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:47 | 200 |  919.367071ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:47 | 200 |   95.890195ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:48 | 200 |  177.006476ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:49 | 200 |  1.202086651s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:10:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:10:54 | 200 |  5.428032754s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:14:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:12 | 200 |   1.15313724s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:14:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:13 | 200 |  198.856371ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:13 | 200 |  199.621843ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |  1.169946923s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |   77.287107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:14 | 200 |  199.272619ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:15 | 200 |  1.118591732s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:14:23 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:14:23 | 200 |  7.988811445s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:16:56 launchpad ollama[74490]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1485 n_keep=24 n_left=2024 n_shift=1012 tid="139996302262272" timestamp=1744924616
+Apr 17 14:17:04 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:04 | 200 |  7.620892723s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:17:05 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:05 | 200 |   1.01311995s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:06 | 200 |  1.013174825s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:07 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:07 | 200 |  851.051034ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:07 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:07 | 200 |  932.455377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:08 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:08 | 200 |  971.281598ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:10 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:10 | 200 |  1.430668485s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:17:17 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:17:17 | 200 |  7.480247037s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.811-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9012707328 required="6.2 GiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.811-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.1 GiB" free_swap="68.9 GiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.812-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33917"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 14:22:25 launchpad ollama[1550]: time=2025-04-17T14:22:25.813-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] build info | build=0 commit="unknown" tid="139956753145856" timestamp=1744924945
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956753145856" timestamp=1744924945 total_threads=16
+Apr 17 14:22:25 launchpad ollama[78330]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33917" tid="139956753145856" timestamp=1744924945
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 14:22:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 14:22:26 launchpad ollama[1550]: time=2025-04-17T14:22:26.063-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 14:22:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 14:22:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 14:22:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 14:22:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 14:22:26 launchpad ollama[78330]: INFO [main] model loaded | tid="139956753145856" timestamp=1744924946
+Apr 17 14:22:27 launchpad ollama[1550]: time=2025-04-17T14:22:27.067-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 14:22:29 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:29 | 200 |  3.836944271s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:22:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:32 | 200 |  194.112954ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:32 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:32 | 200 |  193.831037ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  833.424983ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  106.871435ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:33 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:33 | 200 |  189.877355ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:34 | 200 |  940.721332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:22:43 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:22:43 | 200 |  8.678793145s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:26:25 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:25 | 200 |  1.458377354s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:26:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:26 | 200 |  218.019292ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:26 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:26 | 200 |  216.401377ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |  1.138773958s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |   95.320243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:27 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:27 | 200 |  217.148759ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:28 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:28 | 200 |  1.116623902s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:26:34 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:26:34 | 200 |  6.081919408s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.672-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9008578560 required="6.2 GiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.672-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.9 GiB" free_swap="68.9 GiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.673-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.673-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46437"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.674-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] build info | build=0 commit="unknown" tid="139638404534272" timestamp=1744925690
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139638404534272" timestamp=1744925690 total_threads=16
+Apr 17 14:34:50 launchpad ollama[80427]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46437" tid="139638404534272" timestamp=1744925690
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 17 14:34:50 launchpad ollama[1550]: time=2025-04-17T14:34:50.925-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 17 14:34:50 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 17 14:34:50 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 17 14:34:50 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 17 14:34:51 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 17 14:34:51 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 17 14:34:51 launchpad ollama[80427]: INFO [main] model loaded | tid="139638404534272" timestamp=1744925691
+Apr 17 14:34:51 launchpad ollama[1550]: time=2025-04-17T14:34:51.928-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 17 14:34:52 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2854 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744925692
+Apr 17 14:34:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:34:58 | 200 |  8.378664871s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:35:00 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:00 | 200 |  1.390434188s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:01 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:01 | 200 |   1.38910173s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:02 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:02 | 200 |  819.416884ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:03 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:03 | 200 |  1.350342709s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:05 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:05 | 200 |  1.390120929s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:06 | 200 |  947.665362ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:35:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:35:12 | 200 |  6.235460047s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:37:55 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1140 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744925875
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  1.562747591s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  198.065407ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:57 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:57 | 200 |  196.614327ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  792.646508ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  116.488537ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:58 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:58 | 200 |  197.848106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:37:59 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:37:59 | 200 |  1.083108318s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:38:06 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:38:06 | 200 |  6.245748834s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:40:36 launchpad ollama[80427]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=2459 n_keep=24 n_left=2024 n_shift=1012 tid="139638404534272" timestamp=1744926036
+Apr 17 14:40:43 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:43 | 200 |  7.247812272s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:40:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:44 | 200 |  1.584068257s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:46 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:46 | 200 |  1.583251256s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:47 | 200 |  1.342984628s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:49 | 200 |  1.501316011s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:50 | 200 |  1.584905153s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:52 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:52 | 200 |  1.448927399s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:40:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:40:54 | 200 |  2.424130034s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |  3.336877882s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |   179.73678ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:37 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:37 | 200 |  179.144489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |  906.964389ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |   97.826816ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:38 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:38 | 200 |  177.931558ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:39 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:39 | 200 |  1.164327037s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:41:44 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:41:44 | 200 |  5.028012933s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:42:47 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:47 | 200 |  909.093326ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:42:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:48 | 200 |  177.833467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:48 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:48 | 200 |  178.264789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |  925.001297ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |   94.540628ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:49 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:49 | 200 |  177.681184ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:50 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:50 | 200 |  1.290448752s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:42:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:42:54 | 200 |  4.047281252s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:44:53 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:53 | 200 |  1.442522939s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:44:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:54 | 200 |  184.355254ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:54 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:54 | 200 |  181.365681ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  913.135584ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  101.320559ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:55 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:55 | 200 |  179.596933ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:44:56 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:44:56 | 200 |  1.107232509s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:45:03 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:45:03 | 200 |   6.44463933s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  992.647304ms |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  198.583609ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:13 | 200 |  198.038833ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:14 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:14 | 200 |  1.149953098s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:15 | 200 |  115.614971ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:15 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:15 | 200 |  196.497174ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:16 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:16 | 200 |  1.049435549s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:47:20 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:47:20 | 200 |  4.464160932s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:49:00 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:00 | 200 |  1.985255122s |       127.0.0.1 | POST     "/api/chat"
+Apr 17 14:49:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:11 | 200 |  200.712298ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:11 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:11 | 200 |  201.168477ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  816.529433ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  115.629762ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:12 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:12 | 200 |  198.188649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:13 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:13 | 200 |  1.292137545s |       127.0.0.1 | POST     "/api/embed"
+Apr 17 14:49:18 launchpad ollama[1550]: [GIN] 2025/04/17 - 14:49:18 | 200 |  4.963518908s |       127.0.0.1 | POST     "/api/chat"
+Apr 18 16:07:49 launchpad ollama[1550]: [GIN] 2025/04/18 - 16:07:49 | 200 |     916.824µs |       127.0.0.1 | GET      "/api/tags"
+Apr 18 16:07:49 launchpad ollama[1550]: [GIN] 2025/04/18 - 16:07:49 | 200 |      26.637µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:49:21 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:21 | 200 |     616.613µs |       127.0.0.1 | GET      "/api/tags"
+Apr 19 11:49:21 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:21 | 200 |      29.302µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:49:24 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:49:24 | 200 |      24.758µs |       127.0.0.1 | GET      "/api/version"
+Apr 19 11:53:35 launchpad ollama[1550]: time=2025-04-19T11:53:35.878-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.046-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.046-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 43649"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.047-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.048-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] build info | build=0 commit="unknown" tid="139631567757312" timestamp=1745088816
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139631567757312" timestamp=1745088816 total_threads=16
+Apr 19 11:53:36 launchpad ollama[286489]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43649" tid="139631567757312" timestamp=1745088816
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 19 11:53:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 19 11:53:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 19 11:53:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 19 11:53:36 launchpad ollama[1550]: time=2025-04-19T11:53:36.299-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 19 11:53:43 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 19 11:53:44 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 19 11:53:44 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 19 11:53:44 launchpad ollama[286489]: INFO [main] model loaded | tid="139631567757312" timestamp=1745088824
+Apr 19 11:53:44 launchpad ollama[1550]: time=2025-04-19T11:53:44.572-07:00 level=INFO source=server.go:626 msg="llama runner started in 8.52 seconds"
+Apr 19 11:53:54 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:54 | 200 | 18.760340136s |       127.0.0.1 | POST     "/api/chat"
+Apr 19 11:53:54 launchpad ollama[1550]: time=2025-04-19T11:53:54.692-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:55 | 200 |  1.040315847s |       127.0.0.1 | POST     "/api/chat"
+Apr 19 11:53:55 launchpad ollama[1550]: time=2025-04-19T11:53:55.764-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 19 11:53:58 launchpad ollama[1550]: [GIN] 2025/04/19 - 11:53:58 | 200 |  2.668759449s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.063-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.225-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.5 GiB" free_swap="68.9 GiB"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.225-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 45227"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.226-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] build info | build=0 commit="unknown" tid="139967782432768" timestamp=1745164951
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139967782432768" timestamp=1745164951 total_threads=16
+Apr 20 09:02:31 launchpad ollama[360999]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45227" tid="139967782432768" timestamp=1745164951
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 09:02:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 09:02:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 09:02:31 launchpad ollama[1550]: time=2025-04-20T09:02:31.532-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 09:02:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 09:02:32 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 09:02:32 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 09:02:32 launchpad ollama[360999]: INFO [main] model loaded | tid="139967782432768" timestamp=1745164952
+Apr 20 09:02:32 launchpad ollama[1550]: time=2025-04-20T09:02:32.535-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 20 09:02:51 launchpad ollama[1550]: [GIN] 2025/04/20 - 09:02:51 | 200 | 20.476372249s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:27:22 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:27:22 | 200 |      733.04µs |       127.0.0.1 | GET      "/api/tags"
+Apr 20 16:27:22 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:27:22 | 200 |      31.489µs |       127.0.0.1 | GET      "/api/version"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.513-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.673-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.673-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 32789"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.674-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.675-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] build info | build=0 commit="unknown" tid="140632346476544" timestamp=1745191790
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140632346476544" timestamp=1745191790 total_threads=16
+Apr 20 16:29:50 launchpad ollama[439938]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32789" tid="140632346476544" timestamp=1745191790
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:29:50 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:29:50 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:29:50 launchpad ollama[1550]: time=2025-04-20T16:29:50.980-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:29:50 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:29:51 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:29:51 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:29:51 launchpad ollama[439938]: INFO [main] model loaded | tid="140632346476544" timestamp=1745191791
+Apr 20 16:29:51 launchpad ollama[1550]: time=2025-04-20T16:29:51.984-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 20 16:30:12 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:30:12 | 200 | 22.480292747s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.482-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.638-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.639-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 35699"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.640-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] build info | build=0 commit="unknown" tid="140354029445120" timestamp=1745192447
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140354029445120" timestamp=1745192447 total_threads=16
+Apr 20 16:40:47 launchpad ollama[449214]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35699" tid="140354029445120" timestamp=1745192447
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:40:47 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:40:47 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:40:47 launchpad ollama[1550]: time=2025-04-20T16:40:47.939-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:40:47 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:40:48 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:40:48 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:40:48 launchpad ollama[449214]: INFO [main] model loaded | tid="140354029445120" timestamp=1745192448
+Apr 20 16:40:48 launchpad ollama[1550]: time=2025-04-20T16:40:48.942-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 20 16:41:03 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:41:03 | 200 |  16.09114742s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:42:50 launchpad ollama[1550]: time=2025-04-20T16:42:50.598-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:42:59 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:42:59 | 200 |  8.623545005s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.639-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.797-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.2 GiB" free_swap="68.9 GiB"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.797-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=37 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.5 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.5 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 37 --parallel 1 --port 42229"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 20 16:50:46 launchpad ollama[1550]: time=2025-04-20T16:50:46.798-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] build info | build=0 commit="unknown" tid="139952802324480" timestamp=1745193046
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139952802324480" timestamp=1745193046 total_threads=16
+Apr 20 16:50:46 launchpad ollama[457113]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42229" tid="139952802324480" timestamp=1745193046
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 20 16:50:46 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 20 16:50:46 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 20 16:50:46 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: time=2025-04-20T16:50:47.096-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors: offloading 37 repeating layers to GPU
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors: offloaded 37/41 layers to GPU
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6297.23 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 20 16:50:47 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   120.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1480.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 20 16:50:47 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 37
+Apr 20 16:50:47 launchpad ollama[457113]: INFO [main] model loaded | tid="139952802324480" timestamp=1745193047
+Apr 20 16:50:48 launchpad ollama[1550]: time=2025-04-20T16:50:48.100-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 20 16:51:11 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:51:11 | 200 | 24.693625096s |       127.0.0.1 | POST     "/api/chat"
+Apr 20 16:52:28 launchpad ollama[1550]: time=2025-04-20T16:52:28.274-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 20 16:52:46 launchpad ollama[1550]: [GIN] 2025/04/20 - 16:52:46 | 200 | 18.600588561s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 10:25:40 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:25:40 | 200 |     650.481µs |       127.0.0.1 | GET      "/api/tags"
+Apr 21 10:25:40 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:25:40 | 200 |      38.786µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 10:59:13 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:59:13 | 200 |     702.398µs |       127.0.0.1 | GET      "/api/tags"
+Apr 21 10:59:13 launchpad ollama[1550]: [GIN] 2025/04/21 - 10:59:13 | 200 |      31.923µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 12:20:05 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:05 | 200 |       28.22µs |       127.0.0.1 | GET      "/api/version"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9231532032 required="6.2 GiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.880-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.881-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46503"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 21 12:20:27 launchpad ollama[1550]: time=2025-04-21T12:20:27.882-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] build info | build=0 commit="unknown" tid="139654903910400" timestamp=1745263227
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139654903910400" timestamp=1745263227 total_threads=16
+Apr 21 12:20:27 launchpad ollama[520568]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46503" tid="139654903910400" timestamp=1745263227
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 21 12:20:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 21 12:20:28 launchpad ollama[1550]: time=2025-04-21T12:20:28.132-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 21 12:20:28 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 21 12:20:28 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 21 12:20:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 21 12:20:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 21 12:20:28 launchpad ollama[520568]: INFO [main] model loaded | tid="139654903910400" timestamp=1745263228
+Apr 21 12:20:29 launchpad ollama[1550]: time=2025-04-21T12:20:29.136-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 21 12:20:29 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:29 | 200 |  1.926670109s |       127.0.0.1 | POST     "/api/embed"
+Apr 21 12:20:30 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:30 | 200 |  484.286454ms |       127.0.0.1 | POST     "/api/embed"
+Apr 21 12:20:30 launchpad ollama[1550]: time=2025-04-21T12:20:30.314-07:00 level=INFO source=sched.go:507 msg="updated VRAM based on existing loaded models" gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e library=cuda total="11.6 GiB" available="2.8 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9232056320 required="6.5 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="55.4 GiB" free_swap="68.9 GiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.014-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.5 GiB" memory.required.partial="6.5 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.5 GiB]" memory.weights.total="4.9 GiB" memory.weights.repeating="4.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.015-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38327"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.016-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] build info | build=0 commit="unknown" tid="140185823301632" timestamp=1745263231
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140185823301632" timestamp=1745263231 total_threads=16
+Apr 21 12:20:31 launchpad ollama[520598]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38327" tid="140185823301632" timestamp=1745263231
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from /var/lib/ollama/models/blobs/sha256-6340dc3229b0d08ea9cc49b75d4098702983e17b4c096d57afbbf2ffc813f2be (version GGUF V3 (latest))
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.type str              = model
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                         general.size_label str              = 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                          llama.block_count u32              = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   7:                     llama.embedding_length u32              = 4096
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   8:                  llama.feed_forward_length u32              = 14336
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv   9:                 llama.attention.head_count u32              = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  10:              llama.attention.head_count_kv u32              = 8
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                       llama.rope.freq_base f32              = 500000.000000
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  12:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                          general.file_type u32              = 15
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                           llama.vocab_size u32              = 128256
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                 llama.rope.dimension_count u32              = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                       tokenizer.ggml.model str              = gpt2
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  18:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  19:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  20:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  21:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  22:                tokenizer.ggml.eos_token_id u32              = 128001
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  23:            tokenizer.ggml.padding_token_id u32              = 128001
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  24:               tokenizer.ggml.add_bos_token bool             = true
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  25:               tokenizer.ggml.add_eos_token bool             = false
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  26:                    tokenizer.chat_template str              = {% if not add_generation_prompt is de...
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - kv  27:               general.quantization_version u32              = 2
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   66 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type q4_K:  193 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:   33 tensors
+Apr 21 12:20:31 launchpad ollama[1550]: time=2025-04-21T12:20:31.267-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.7999 MB
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 131072
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_K - Medium
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW)
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = DeepSeek R1 Distill Llama 8B
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128001 '<|end▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: PAD token        = 128001 '<|end▁of▁sentence|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 21 12:20:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 21 12:20:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 21 12:20:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 21 12:20:35 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4403.50 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 21 12:20:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 21 12:20:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 21 12:20:36 launchpad ollama[520598]: INFO [main] model loaded | tid="140185823301632" timestamp=1745263236
+Apr 21 12:20:36 launchpad ollama[1550]: time=2025-04-21T12:20:36.532-07:00 level=INFO source=server.go:626 msg="llama runner started in 5.52 seconds"
+Apr 21 12:20:51 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:51 | 200 | 21.384301145s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:20:53 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:53 | 200 |  1.694659971s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:20:55 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:20:55 | 200 |    1.7042573s |       127.0.0.1 | POST     "/api/chat"
+Apr 21 12:22:19 launchpad ollama[1550]: [GIN] 2025/04/21 - 12:22:19 | 200 |      26.357µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:53:45 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:45 | 200 |     624.913µs |       127.0.0.1 | GET      "/api/tags"
+Apr 22 16:53:45 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:45 | 200 |      36.386µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:53:53 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:53:53 | 200 |      25.526µs |       127.0.0.1 | GET      "/api/version"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8910471168 required="6.2 GiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.2 GiB" free_swap="68.9 GiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.803-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39735"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.804-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 16:54:56 launchpad ollama[1550]: time=2025-04-22T16:54:56.805-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] build info | build=0 commit="unknown" tid="140233234300928" timestamp=1745366096
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140233234300928" timestamp=1745366096 total_threads=16
+Apr 22 16:54:56 launchpad ollama[662024]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39735" tid="140233234300928" timestamp=1745366096
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 16:54:57 launchpad ollama[1550]: time=2025-04-22T16:54:57.056-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 16:54:57 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 16:54:57 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 16:54:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 16:54:57 launchpad ollama[662024]: INFO [main] model loaded | tid="140233234300928" timestamp=1745366097
+Apr 22 16:54:58 launchpad ollama[1550]: time=2025-04-22T16:54:58.059-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 16:54:59 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:54:59 | 200 |  3.133858652s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 16:55:00 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:55:00 | 200 |   309.42198ms |       127.0.0.1 | POST     "/api/chat"
+Apr 22 16:55:01 launchpad ollama[1550]: [GIN] 2025/04/22 - 16:55:01 | 200 |  1.346155592s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9045213184 required="6.2 GiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.227-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35675"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.228-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.229-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] build info | build=0 commit="unknown" tid="140574638034944" timestamp=1745366664
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140574638034944" timestamp=1745366664 total_threads=16
+Apr 22 17:04:24 launchpad ollama[663467]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35675" tid="140574638034944" timestamp=1745366664
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:04:24 launchpad ollama[1550]: time=2025-04-22T17:04:24.480-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:04:24 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:04:24 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:04:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:04:25 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:04:25 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:04:25 launchpad ollama[663467]: INFO [main] model loaded | tid="140574638034944" timestamp=1745366665
+Apr 22 17:04:25 launchpad ollama[1550]: time=2025-04-22T17:04:25.483-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 17:04:35 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:04:35 | 200 | 11.442816882s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:07:21 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:07:21 | 200 |  6.018450452s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:09:40 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:09:40 | 200 |   8.79640683s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9008644096 required="6.2 GiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.304-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38425"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.305-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] build info | build=0 commit="unknown" tid="140393734627328" timestamp=1745367387
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140393734627328" timestamp=1745367387 total_threads=16
+Apr 22 17:16:27 launchpad ollama[665429]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38425" tid="140393734627328" timestamp=1745367387
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:16:27 launchpad ollama[1550]: time=2025-04-22T17:16:27.557-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:16:27 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:16:27 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:16:27 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:16:28 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:16:28 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:16:28 launchpad ollama[665429]: INFO [main] model loaded | tid="140393734627328" timestamp=1745367388
+Apr 22 17:16:28 launchpad ollama[1550]: time=2025-04-22T17:16:28.560-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 17:16:36 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:16:36 | 200 |  9.136432762s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8955559936 required="6.2 GiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.594-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44483"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.595-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.596-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] build info | build=0 commit="unknown" tid="139861360054272" timestamp=1745367757
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139861360054272" timestamp=1745367757 total_threads=16
+Apr 22 17:22:37 launchpad ollama[666377]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44483" tid="139861360054272" timestamp=1745367757
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:22:37 launchpad ollama[1550]: time=2025-04-22T17:22:37.846-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:22:37 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:22:37 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:22:37 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:22:38 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:22:38 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:22:38 launchpad ollama[666377]: INFO [main] model loaded | tid="139861360054272" timestamp=1745367758
+Apr 22 17:22:38 launchpad ollama[1550]: time=2025-04-22T17:22:38.851-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 17:22:47 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:22:47 | 200 |  9.620504572s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:27:19 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:27:19 | 200 |  6.365838487s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:32:07 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:32:07 | 200 |  8.012798901s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:34:22 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:34:22 | 200 |  9.885575052s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:37:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:37:27 | 200 | 11.185556841s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8993046528 required="6.2 GiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.490-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41491"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.491-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.492-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] build info | build=0 commit="unknown" tid="140105416462336" timestamp=1745369020
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140105416462336" timestamp=1745369020 total_threads=16
+Apr 22 17:43:40 launchpad ollama[669773]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41491" tid="140105416462336" timestamp=1745369020
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 17:43:40 launchpad ollama[1550]: time=2025-04-22T17:43:40.742-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 17:43:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 17:43:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 17:43:40 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 17:43:41 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 17:43:41 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 17:43:41 launchpad ollama[669773]: INFO [main] model loaded | tid="140105416462336" timestamp=1745369021
+Apr 22 17:43:41 launchpad ollama[1550]: time=2025-04-22T17:43:41.746-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 17:43:53 launchpad ollama[1550]: [GIN] 2025/04/22 - 17:43:53 | 200 | 13.312830298s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.642-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9031450624 required="6.2 GiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.642-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.643-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41395"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.644-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] build info | build=0 commit="unknown" tid="140079229906944" timestamp=1745370360
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140079229906944" timestamp=1745370360 total_threads=16
+Apr 22 18:06:00 launchpad ollama[673206]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41395" tid="140079229906944" timestamp=1745370360
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:06:00 launchpad ollama[1550]: time=2025-04-22T18:06:00.895-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:06:00 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:06:00 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:06:00 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:06:01 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:06:01 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:06:01 launchpad ollama[673206]: INFO [main] model loaded | tid="140079229906944" timestamp=1745370361
+Apr 22 18:06:01 launchpad ollama[1550]: time=2025-04-22T18:06:01.898-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:06:13 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:06:13 | 200 | 13.014326109s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8910274560 required="6.2 GiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.602-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.603-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38957"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.604-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.604-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] build info | build=0 commit="unknown" tid="140297879146496" timestamp=1745371043
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140297879146496" timestamp=1745371043 total_threads=16
+Apr 22 18:17:23 launchpad ollama[674998]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38957" tid="140297879146496" timestamp=1745371043
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:17:23 launchpad ollama[1550]: time=2025-04-22T18:17:23.855-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:17:23 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:17:23 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:17:23 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:17:24 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:17:24 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:17:24 launchpad ollama[674998]: INFO [main] model loaded | tid="140297879146496" timestamp=1745371044
+Apr 22 18:17:24 launchpad ollama[1550]: time=2025-04-22T18:17:24.858-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:17:37 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:17:37 | 200 | 14.234576356s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985640960 required="6.2 GiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.529-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38355"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.530-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] build info | build=0 commit="unknown" tid="140178250317824" timestamp=1745371520
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140178250317824" timestamp=1745371520 total_threads=16
+Apr 22 18:25:20 launchpad ollama[676233]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38355" tid="140178250317824" timestamp=1745371520
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:25:20 launchpad ollama[1550]: time=2025-04-22T18:25:20.781-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:25:20 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:25:20 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:25:20 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:25:21 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:25:21 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:25:21 launchpad ollama[676233]: INFO [main] model loaded | tid="140178250317824" timestamp=1745371521
+Apr 22 18:25:21 launchpad ollama[1550]: time=2025-04-22T18:25:21.785-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 22 18:25:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:25:27 | 200 |  7.044238455s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:27:46 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:27:46 | 200 |   5.82328305s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8987344896 required="6.2 GiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.993-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42383"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.994-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:33:21 launchpad ollama[1550]: time=2025-04-22T18:33:21.995-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] build info | build=0 commit="unknown" tid="140251518558208" timestamp=1745372002
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140251518558208" timestamp=1745372002 total_threads=16
+Apr 22 18:33:22 launchpad ollama[677442]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42383" tid="140251518558208" timestamp=1745372002
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:33:22 launchpad ollama[1550]: time=2025-04-22T18:33:22.246-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:33:22 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:33:22 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:33:22 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:33:22 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:33:23 launchpad ollama[677442]: INFO [main] model loaded | tid="140251518558208" timestamp=1745372003
+Apr 22 18:33:23 launchpad ollama[1550]: time=2025-04-22T18:33:23.249-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:33:29 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:33:29 | 200 |  7.950536678s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:34:59 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:34:59 | 200 |  3.708373562s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:36:15 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:36:15 | 200 |  4.578915411s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:37:25 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:37:25 | 200 |  3.434990383s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:39:24 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:39:24 | 200 |  4.592553143s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.264-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9071296512 required="6.2 GiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.264-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.265-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.265-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45871"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.266-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] build info | build=0 commit="unknown" tid="139642144657408" timestamp=1745373341
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139642144657408" timestamp=1745373341 total_threads=16
+Apr 22 18:55:41 launchpad ollama[680809]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45871" tid="139642144657408" timestamp=1745373341
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 22 18:55:41 launchpad ollama[1550]: time=2025-04-22T18:55:41.517-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 22 18:55:41 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 22 18:55:41 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 22 18:55:41 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 22 18:55:42 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 22 18:55:42 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 22 18:55:42 launchpad ollama[680809]: INFO [main] model loaded | tid="139642144657408" timestamp=1745373342
+Apr 22 18:55:42 launchpad ollama[1550]: time=2025-04-22T18:55:42.521-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 22 18:55:49 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:55:49 | 200 |   8.29654315s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 18:58:29 launchpad ollama[1550]: [GIN] 2025/04/22 - 18:58:29 | 200 |  6.561886447s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 19:00:50 launchpad ollama[1550]: [GIN] 2025/04/22 - 19:00:50 | 200 |  7.304756304s |       127.0.0.1 | POST     "/api/chat"
+Apr 22 19:04:27 launchpad ollama[1550]: [GIN] 2025/04/22 - 19:04:27 | 200 |  6.741603191s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:20:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:54 | 200 |     640.676µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 11:20:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:55 | 200 |      24.144µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 11:20:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:20:58 | 200 |      36.017µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.497-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9196929024 required="6.2 GiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.497-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.498-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43495"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.499-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] build info | build=0 commit="unknown" tid="140569704751104" timestamp=1745432606
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140569704751104" timestamp=1745432606 total_threads=16
+Apr 23 11:23:26 launchpad ollama[704084]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43495" tid="140569704751104" timestamp=1745432606
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:23:26 launchpad ollama[1550]: time=2025-04-23T11:23:26.750-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:23:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:23:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:23:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:23:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:23:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:23:27 launchpad ollama[704084]: INFO [main] model loaded | tid="140569704751104" timestamp=1745432607
+Apr 23 11:23:27 launchpad ollama[1550]: time=2025-04-23T11:23:27.754-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 11:23:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:33 | 200 |  7.387660865s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:23:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:34 | 200 |  1.187274349s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:23:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:23:37 | 200 |  2.783100881s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9211609088 required="6.2 GiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.1 GiB" free_swap="68.9 GiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.460-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.461-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38373"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.462-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] build info | build=0 commit="unknown" tid="140119599300608" timestamp=1745432952
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140119599300608" timestamp=1745432952 total_threads=16
+Apr 23 11:29:12 launchpad ollama[704979]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38373" tid="140119599300608" timestamp=1745432952
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:29:12 launchpad ollama[1550]: time=2025-04-23T11:29:12.712-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:29:12 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:29:12 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:29:12 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:29:13 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:29:13 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:29:13 launchpad ollama[704979]: INFO [main] model loaded | tid="140119599300608" timestamp=1745432953
+Apr 23 11:29:13 launchpad ollama[1550]: time=2025-04-23T11:29:13.716-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:29:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:29:19 | 200 |  7.602748082s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:31:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:31:01 | 200 |  6.725284712s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.789-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34019"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:38:35 launchpad ollama[1550]: time=2025-04-23T11:38:35.790-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] build info | build=0 commit="unknown" tid="140606098849792" timestamp=1745433515
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140606098849792" timestamp=1745433515 total_threads=16
+Apr 23 11:38:35 launchpad ollama[706422]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34019" tid="140606098849792" timestamp=1745433515
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:38:35 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:38:36 launchpad ollama[1550]: time=2025-04-23T11:38:36.041-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:38:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:38:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:38:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:38:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:38:36 launchpad ollama[706422]: INFO [main] model loaded | tid="140606098849792" timestamp=1745433516
+Apr 23 11:38:37 launchpad ollama[1550]: time=2025-04-23T11:38:37.045-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:38:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:38:45 | 200 | 10.146176166s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.477-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36333"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.478-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] build info | build=0 commit="unknown" tid="140246380457984" timestamp=1745434226
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140246380457984" timestamp=1745434226 total_threads=16
+Apr 23 11:50:26 launchpad ollama[708177]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36333" tid="140246380457984" timestamp=1745434226
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:50:26 launchpad ollama[1550]: time=2025-04-23T11:50:26.729-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:50:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:50:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:50:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:50:27 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:50:27 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:50:27 launchpad ollama[708177]: INFO [main] model loaded | tid="140246380457984" timestamp=1745434227
+Apr 23 11:50:27 launchpad ollama[1550]: time=2025-04-23T11:50:27.732-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:50:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:50:36 | 200 |  9.728679535s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.390-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34477"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.391-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] build info | build=0 commit="unknown" tid="140169853362176" timestamp=1745434702
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140169853362176" timestamp=1745434702 total_threads=16
+Apr 23 11:58:22 launchpad ollama[709417]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34477" tid="140169853362176" timestamp=1745434702
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 11:58:22 launchpad ollama[1550]: time=2025-04-23T11:58:22.642-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 11:58:22 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 11:58:22 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 11:58:22 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 11:58:23 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 11:58:23 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 11:58:23 launchpad ollama[709417]: INFO [main] model loaded | tid="140169853362176" timestamp=1745434703
+Apr 23 11:58:23 launchpad ollama[1550]: time=2025-04-23T11:58:23.645-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 11:58:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 11:58:33 | 200 | 11.194826934s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:01:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:01:24 | 200 |   7.31735944s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9295101952 required="6.2 GiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="54.0 GiB" free_swap="68.9 GiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.137-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44523"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.138-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] build info | build=0 commit="unknown" tid="140476077305856" timestamp=1745435511
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140476077305856" timestamp=1745435511 total_threads=16
+Apr 23 12:11:51 launchpad ollama[711454]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44523" tid="140476077305856" timestamp=1745435511
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:11:51 launchpad ollama[1550]: time=2025-04-23T12:11:51.389-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:11:51 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:11:51 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:11:51 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:11:52 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:11:52 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:11:52 launchpad ollama[711454]: INFO [main] model loaded | tid="140476077305856" timestamp=1745435512
+Apr 23 12:11:52 launchpad ollama[1550]: time=2025-04-23T12:11:52.393-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:11:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:11:59 | 200 |  8.701291304s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9182248960 required="6.2 GiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.9 GiB" free_swap="68.9 GiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.173-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38925"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.174-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] build info | build=0 commit="unknown" tid="139797635899392" timestamp=1745436678
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139797635899392" timestamp=1745436678 total_threads=16
+Apr 23 12:31:18 launchpad ollama[714582]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38925" tid="139797635899392" timestamp=1745436678
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:31:18 launchpad ollama[1550]: time=2025-04-23T12:31:18.425-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:31:18 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:31:18 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:31:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:31:19 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:31:19 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:31:19 launchpad ollama[714582]: INFO [main] model loaded | tid="139797635899392" timestamp=1745436679
+Apr 23 12:31:19 launchpad ollama[1550]: time=2025-04-23T12:31:19.429-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:31:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:31:26 | 200 |  8.914395683s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9199026176 required="6.2 GiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.731-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41947"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.732-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.733-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] build info | build=0 commit="unknown" tid="139621129306112" timestamp=1745437230
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139621129306112" timestamp=1745437230 total_threads=16
+Apr 23 12:40:30 launchpad ollama[715978]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41947" tid="139621129306112" timestamp=1745437230
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:40:30 launchpad ollama[1550]: time=2025-04-23T12:40:30.983-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:40:30 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:40:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:40:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:40:31 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:40:31 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:40:31 launchpad ollama[715978]: INFO [main] model loaded | tid="139621129306112" timestamp=1745437231
+Apr 23 12:40:31 launchpad ollama[1550]: time=2025-04-23T12:40:31.986-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 12:40:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:40:39 | 200 |  8.635022503s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:42:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:42:18 | 200 |  9.352342256s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:44:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:44:21 | 200 |  9.578402887s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:46:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:46:15 | 200 |  8.397961733s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:50:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:50:03 | 200 | 11.400203648s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.098-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37239"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.099-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] build info | build=0 commit="unknown" tid="140095530123264" timestamp=1745438199
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140095530123264" timestamp=1745438199 total_threads=16
+Apr 23 12:56:39 launchpad ollama[718453]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37239" tid="140095530123264" timestamp=1745438199
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 12:56:39 launchpad ollama[1550]: time=2025-04-23T12:56:39.351-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 12:56:39 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 12:56:39 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 12:56:39 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 12:56:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 12:56:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 12:56:40 launchpad ollama[718453]: INFO [main] model loaded | tid="140095530123264" timestamp=1745438200
+Apr 23 12:56:40 launchpad ollama[1550]: time=2025-04-23T12:56:40.355-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 12:56:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 12:56:50 | 200 | 11.124863688s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.688-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.689-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34377"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.689-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.690-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.690-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] build info | build=0 commit="unknown" tid="139719604162560" timestamp=1745438649
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139719604162560" timestamp=1745438649 total_threads=16
+Apr 23 13:04:09 launchpad ollama[719592]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34377" tid="139719604162560" timestamp=1745438649
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:04:09 launchpad ollama[1550]: time=2025-04-23T13:04:09.940-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:04:09 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:04:09 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:04:09 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:04:10 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:04:10 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:04:10 launchpad ollama[719592]: INFO [main] model loaded | tid="139719604162560" timestamp=1745438650
+Apr 23 13:04:10 launchpad ollama[1550]: time=2025-04-23T13:04:10.944-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:04:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:04:21 | 200 | 11.869875333s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.726-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.727-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.727-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36641"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.728-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] build info | build=0 commit="unknown" tid="139783658434560" timestamp=1745438965
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139783658434560" timestamp=1745438965 total_threads=16
+Apr 23 13:09:25 launchpad ollama[720418]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36641" tid="139783658434560" timestamp=1745438965
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:09:25 launchpad ollama[1550]: time=2025-04-23T13:09:25.979-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:09:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:09:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:09:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:09:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:09:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:09:26 launchpad ollama[720418]: INFO [main] model loaded | tid="139783658434560" timestamp=1745438966
+Apr 23 13:09:26 launchpad ollama[1550]: time=2025-04-23T13:09:26.983-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:09:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:09:39 | 200 | 14.102146572s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297199104 required="6.2 GiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.601-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.602-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40583"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.603-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] build info | build=0 commit="unknown" tid="139956267622400" timestamp=1745439700
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956267622400" timestamp=1745439700 total_threads=16
+Apr 23 13:21:40 launchpad ollama[722240]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40583" tid="139956267622400" timestamp=1745439700
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:21:40 launchpad ollama[1550]: time=2025-04-23T13:21:40.854-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:21:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:21:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:21:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:21:41 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:21:41 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:21:41 launchpad ollama[722240]: INFO [main] model loaded | tid="139956267622400" timestamp=1745439701
+Apr 23 13:21:41 launchpad ollama[1550]: time=2025-04-23T13:21:41.857-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:21:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:21:53 | 200 | 12.953755348s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:25:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:25:52 | 200 |  8.075097905s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.326-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40585"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.327-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] build info | build=0 commit="unknown" tid="140497420255232" timestamp=1745440513
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140497420255232" timestamp=1745440513 total_threads=16
+Apr 23 13:35:13 launchpad ollama[724277]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40585" tid="140497420255232" timestamp=1745440513
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:35:13 launchpad ollama[1550]: time=2025-04-23T13:35:13.578-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:35:13 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:35:13 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:35:13 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:35:14 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:35:14 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:35:14 launchpad ollama[724277]: INFO [main] model loaded | tid="140497420255232" timestamp=1745440514
+Apr 23 13:35:14 launchpad ollama[1550]: time=2025-04-23T13:35:14.582-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:35:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:35:24 | 200 | 11.142479482s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:37:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:37:51 | 200 | 10.230882447s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:41:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:41:43 | 200 |  9.864597151s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.913-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9190637568 required="6.2 GiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.913-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.914-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44127"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 13:53:33 launchpad ollama[1550]: time=2025-04-23T13:53:33.915-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] build info | build=0 commit="unknown" tid="139936180211712" timestamp=1745441613
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139936180211712" timestamp=1745441613 total_threads=16
+Apr 23 13:53:33 launchpad ollama[727133]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44127" tid="139936180211712" timestamp=1745441613
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 13:53:33 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 13:53:34 launchpad ollama[1550]: time=2025-04-23T13:53:34.166-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 13:53:34 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 13:53:34 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 13:53:34 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 13:53:34 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 13:53:34 launchpad ollama[727133]: INFO [main] model loaded | tid="139936180211712" timestamp=1745441614
+Apr 23 13:53:35 launchpad ollama[1550]: time=2025-04-23T13:53:35.170-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 13:53:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 13:53:44 | 200 | 11.276115175s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.8 GiB" free_swap="68.9 GiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.716-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 42567"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.717-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.718-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] build info | build=0 commit="unknown" tid="139757118943232" timestamp=1745442445
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139757118943232" timestamp=1745442445 total_threads=16
+Apr 23 14:07:25 launchpad ollama[729184]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42567" tid="139757118943232" timestamp=1745442445
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:07:25 launchpad ollama[1550]: time=2025-04-23T14:07:25.968-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:07:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:07:26 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:07:26 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:07:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:07:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:07:26 launchpad ollama[729184]: INFO [main] model loaded | tid="139757118943232" timestamp=1745442446
+Apr 23 14:07:26 launchpad ollama[1550]: time=2025-04-23T14:07:26.972-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:07:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:07:35 | 200 | 10.357410818s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:10:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:10:51 | 200 | 13.678362324s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:15:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:15:25 | 200 | 11.600829043s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297723392 required="6.2 GiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.738-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45449"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.739-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] build info | build=0 commit="unknown" tid="140011585576960" timestamp=1745443596
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140011585576960" timestamp=1745443596 total_threads=16
+Apr 23 14:26:36 launchpad ollama[732078]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45449" tid="140011585576960" timestamp=1745443596
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:26:36 launchpad ollama[1550]: time=2025-04-23T14:26:36.991-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:26:37 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:26:37 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:26:37 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:26:37 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:26:37 launchpad ollama[732078]: INFO [main] model loaded | tid="140011585576960" timestamp=1745443597
+Apr 23 14:26:37 launchpad ollama[1550]: time=2025-04-23T14:26:37.994-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:26:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:26:48 | 200 | 12.380089776s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.435-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299820544 required="6.2 GiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.435-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.436-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.436-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44737"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.437-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] build info | build=0 commit="unknown" tid="140588807684096" timestamp=1745443981
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140588807684096" timestamp=1745443981 total_threads=16
+Apr 23 14:33:01 launchpad ollama[733036]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44737" tid="140588807684096" timestamp=1745443981
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:33:01 launchpad ollama[1550]: time=2025-04-23T14:33:01.688-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:33:01 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:33:01 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:33:01 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:33:02 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:33:02 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:33:02 launchpad ollama[733036]: INFO [main] model loaded | tid="140588807684096" timestamp=1745443982
+Apr 23 14:33:02 launchpad ollama[1550]: time=2025-04-23T14:33:02.692-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 14:33:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:33:11 | 200 | 10.308262643s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200533504 required="6.2 GiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.895-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46227"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.897-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 14:38:39 launchpad ollama[1550]: time=2025-04-23T14:38:39.898-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] build info | build=0 commit="unknown" tid="140104576258048" timestamp=1745444319
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140104576258048" timestamp=1745444319 total_threads=16
+Apr 23 14:38:39 launchpad ollama[734094]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46227" tid="140104576258048" timestamp=1745444319
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 14:38:39 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 14:38:40 launchpad ollama[1550]: time=2025-04-23T14:38:40.149-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 14:38:40 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 14:38:40 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 14:38:40 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 14:38:40 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 14:38:40 launchpad ollama[734094]: INFO [main] model loaded | tid="140104576258048" timestamp=1745444320
+Apr 23 14:38:41 launchpad ollama[1550]: time=2025-04-23T14:38:41.152-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 14:38:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:45 | 200 |  5.917909773s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:50 | 200 |  4.403901904s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:54 | 200 |  4.295810322s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:38:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:38:54 | 200 |  463.056432ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:16 | 200 |  1.867684278s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:41:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:17 | 200 |  217.566499ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:17 | 200 |  218.140938ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:19 | 200 |  1.080943491s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:41:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:41:21 | 200 |  2.308630604s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:42:37 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3631 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444557
+Apr 23 14:42:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:39 | 200 |  1.929403943s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:42:39 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444559
+Apr 23 14:42:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:40 | 200 |  1.099211505s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:40 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444560
+Apr 23 14:42:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:41 | 200 |  1.100569352s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:43 | 200 |  1.433242062s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:42:43 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1075 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444563
+Apr 23 14:42:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:42:44 | 200 |  1.481279163s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:43:24 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1163 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444604
+Apr 23 14:43:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:25 | 200 |  1.646370369s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:43:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:25 | 200 |  177.227665ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:26 | 200 |  176.892154ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:27 | 200 |  1.364166657s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:43:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:43:29 | 200 |  1.899849366s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:10 | 200 |  789.526405ms |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:11 | 200 |   172.84459ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:11 | 200 |  175.194876ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:12 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:12 | 200 |  1.259187511s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:14 | 200 |  1.532774908s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:44:46 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444686
+Apr 23 14:44:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:46 | 200 |  1.041423442s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:47 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="140104576258048" timestamp=1745444687
+Apr 23 14:44:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:47 | 200 |  1.003305675s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:49 | 200 |  1.321671086s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:44:49 launchpad ollama[734094]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1075 n_keep=24 n_left=2024 n_shift=1012 tid="140104576258048" timestamp=1745444689
+Apr 23 14:44:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:44:51 | 200 |  2.183832595s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:46:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:44 | 200 |  232.136796ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:44 | 200 |  212.298351ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:45 | 200 |  880.720113ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:46:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:46:47 | 200 |  2.426639671s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:15 | 200 |  253.139884ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:15 | 200 |  224.385177ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:16 | 200 |  1.057310693s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:51:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:51:21 | 200 |  4.634745846s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 14:54:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:25 | 200 |  205.010376ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:26 | 200 |  191.526085ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:27 | 200 |  1.061523535s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 14:54:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 14:54:33 | 200 |  5.795074344s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297133568 required="6.2 GiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.182-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36557"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.184-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] build info | build=0 commit="unknown" tid="140478615900160" timestamp=1745446116
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140478615900160" timestamp=1745446116 total_threads=16
+Apr 23 15:08:36 launchpad ollama[738663]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36557" tid="140478615900160" timestamp=1745446116
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:08:36 launchpad ollama[1550]: time=2025-04-23T15:08:36.436-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:08:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:08:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:08:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:08:37 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:08:37 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:08:37 launchpad ollama[738663]: INFO [main] model loaded | tid="140478615900160" timestamp=1745446117
+Apr 23 15:08:37 launchpad ollama[1550]: time=2025-04-23T15:08:37.439-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 15:08:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:37 | 200 |  1.752160476s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:38 | 200 |  316.550556ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:38 | 200 |  913.865381ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:40 | 200 |  1.524003863s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:52 | 200 |  162.847618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:52 | 200 |  162.958158ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:53 | 200 |  1.119670421s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:08:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:08:54 | 200 |  1.321134401s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:09:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:22 | 200 |  209.634616ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:23 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:23 | 200 |  186.861717ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:24 | 200 |  1.087752103s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:09:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:09:30 | 200 |  6.284829003s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.591-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 39331"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.593-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] build info | build=0 commit="unknown" tid="139751249104896" timestamp=1745447345
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139751249104896" timestamp=1745447345 total_threads=16
+Apr 23 15:29:05 launchpad ollama[741770]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="39331" tid="139751249104896" timestamp=1745447345
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:29:05 launchpad ollama[1550]: time=2025-04-23T15:29:05.844-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:29:05 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:29:05 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:29:05 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:29:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:29:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:29:06 launchpad ollama[741770]: INFO [main] model loaded | tid="139751249104896" timestamp=1745447346
+Apr 23 15:29:06 launchpad ollama[1550]: time=2025-04-23T15:29:06.848-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 15:29:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:07 | 200 |  1.783493384s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:07 | 200 |  352.608519ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:08 | 200 |  935.295941ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:29:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:29:16 | 200 |  7.495554825s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:31:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:35 | 200 |  206.886892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:35 | 200 |  187.438793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:36 | 200 |  1.066432478s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:31:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:31:43 | 200 |   7.29251208s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:34:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:05 | 200 |  225.521346ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:06 | 200 |    211.0022ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:07 | 200 |  1.134593019s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:34:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:34:15 | 200 |  8.252084922s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9299755008 required="6.2 GiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.290-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40299"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.292-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] build info | build=0 commit="unknown" tid="140164261195776" timestamp=1745448205
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140164261195776" timestamp=1745448205 total_threads=16
+Apr 23 15:43:25 launchpad ollama[743911]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40299" tid="140164261195776" timestamp=1745448205
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:43:25 launchpad ollama[1550]: time=2025-04-23T15:43:25.543-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:43:25 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:43:25 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:43:25 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:43:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:43:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:43:26 launchpad ollama[743911]: INFO [main] model loaded | tid="140164261195776" timestamp=1745448206
+Apr 23 15:43:26 launchpad ollama[1550]: time=2025-04-23T15:43:26.546-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 15:43:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:26 | 200 |   1.66424257s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:27 | 200 |  229.121682ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:27 | 200 |  922.185553ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:43:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:43:34 | 200 |  6.808054244s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.247-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.248-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.248-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36393"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.249-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] build info | build=0 commit="unknown" tid="139678188761088" timestamp=1745448538
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139678188761088" timestamp=1745448538 total_threads=16
+Apr 23 15:48:58 launchpad ollama[744744]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36393" tid="139678188761088" timestamp=1745448538
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 15:48:58 launchpad ollama[1550]: time=2025-04-23T15:48:58.501-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 15:48:58 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 15:48:58 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 15:48:58 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 15:48:59 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 15:48:59 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 15:48:59 launchpad ollama[744744]: INFO [main] model loaded | tid="139678188761088" timestamp=1745448539
+Apr 23 15:48:59 launchpad ollama[1550]: time=2025-04-23T15:48:59.504-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 15:48:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:48:59 | 200 |  1.641345052s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:48:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:48:59 | 200 |  200.495439ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:49:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:49:00 | 200 |  918.072418ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:49:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:49:03 | 200 |    2.7918668s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:50:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:20 | 200 |  197.142027ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:20 | 200 |  184.553055ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:22 | 200 |  1.218516621s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:50:22 launchpad ollama[744744]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1013 n_keep=24 n_left=2024 n_shift=1012 tid="139678188761088" timestamp=1745448622
+Apr 23 15:50:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:50:24 | 200 |  2.043965397s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:51:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:33 | 200 |   204.39221ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:33 | 200 |  183.047103ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:34 | 200 |  1.156643663s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:51:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:51:40 | 200 |  5.534130701s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:53:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:45 | 200 |  207.214064ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:45 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:45 | 200 |   194.09473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:46 | 200 |  1.079799274s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:53:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:53:49 | 200 |  2.536857476s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 15:54:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:40 | 200 |  208.041304ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:40 | 200 |  194.694403ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:41 | 200 |  1.229778061s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 15:54:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 15:54:46 | 200 |  5.179118839s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297657856 required="6.2 GiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.4 GiB" free_swap="68.9 GiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.921-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41021"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:01:51 launchpad ollama[1550]: time=2025-04-23T16:01:51.923-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] build info | build=0 commit="unknown" tid="139822820651008" timestamp=1745449311
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139822820651008" timestamp=1745449311 total_threads=16
+Apr 23 16:01:51 launchpad ollama[746687]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41021" tid="139822820651008" timestamp=1745449311
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:01:51 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:01:52 launchpad ollama[1550]: time=2025-04-23T16:01:52.174-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:01:52 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:01:52 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:01:52 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:01:52 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:01:52 launchpad ollama[746687]: INFO [main] model loaded | tid="139822820651008" timestamp=1745449312
+Apr 23 16:01:53 launchpad ollama[1550]: time=2025-04-23T16:01:53.178-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:01:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:53 | 200 |  1.706338953s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:53 | 200 |  261.222902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:54 | 200 |   904.29523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:01:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:01:57 | 200 |  3.144308181s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:02:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:07 | 200 |  169.518046ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:07 | 200 |  170.777694ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:08 | 200 |  1.163610601s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:02:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:02:11 | 200 |  3.037664362s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:03:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:56 | 200 |   233.25358ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:57 | 200 |  211.941961ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:58 | 200 |  1.150480481s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:03:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:03:59 | 200 |  1.591883084s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:07:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:01 | 200 |  649.941245ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:01 | 200 |  620.256672ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:02 | 200 |  1.254070605s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:07:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:07:07 | 200 |  4.512974526s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:08:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:11 | 200 |  217.346016ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:11 | 200 |  190.966444ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:12 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:12 | 200 |  1.203683956s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:20 | 200 |  8.272710161s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:52 | 200 |  212.727769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:52 | 200 |  191.783099ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:53 | 200 |  1.240007945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:08:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:08:57 | 200 |  4.110226989s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:10:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:33 | 200 |  205.415213ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:33 | 200 |  195.861841ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:35 | 200 |  1.258611531s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:10:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:10:38 | 200 |  3.649750446s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:11:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:43 | 200 |  198.033315ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:43 | 200 |  184.186461ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:44 | 200 |   1.19371359s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:11:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:11:53 | 200 |   8.97736957s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:14:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:04 | 200 |  262.971922ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:05 | 200 |  229.765107ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:06 | 200 |  1.269428039s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:14:11 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:14:11 | 200 |   5.09113946s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:15:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:42 | 200 |  204.459233ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:42 | 200 |   184.23007ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:43 | 200 |  892.046879ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:15:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:15:51 | 200 |  7.945472274s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:19:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:02 | 200 |  225.963756ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:02 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:02 | 200 |  214.736421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:03 | 200 |  973.320591ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:19:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:19:08 | 200 |  4.787175055s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:48 | 200 |  193.996215ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:48 | 200 |   181.40443ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:49 | 200 |  1.249854336s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:23:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:23:52 | 200 |  2.889936862s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:25:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:50 | 200 |  207.587684ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:50 | 200 |  190.318979ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:51 | 200 |  1.116785127s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:25:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:25:54 | 200 |  3.055486338s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9206300672 required="6.2 GiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.719-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37471"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.721-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] build info | build=0 commit="unknown" tid="140589304725504" timestamp=1745451425
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140589304725504" timestamp=1745451425 total_threads=16
+Apr 23 16:37:05 launchpad ollama[752373]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37471" tid="140589304725504" timestamp=1745451425
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:37:05 launchpad ollama[1550]: time=2025-04-23T16:37:05.971-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:37:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:37:06 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:37:06 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:37:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:37:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:37:06 launchpad ollama[752373]: INFO [main] model loaded | tid="140589304725504" timestamp=1745451426
+Apr 23 16:37:06 launchpad ollama[1550]: time=2025-04-23T16:37:06.975-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:37:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:07 | 200 |  1.715130566s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:07 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:07 | 200 |  277.059508ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:08 | 200 |  993.749274ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:37:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:37:15 | 200 |  7.161703414s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:38:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:51 | 200 |  214.272645ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:51 | 200 |  194.126252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:52 | 200 |   1.22954814s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:38:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:38:57 | 200 |  5.296634418s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:43:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:38 | 200 |  270.675166ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:38 | 200 |  256.470661ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:39 | 200 |  1.217220835s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:43:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:43:46 | 200 |  6.269839591s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:45:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:21 | 200 |  200.567383ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:21 | 200 |  181.252238ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:22 | 200 |  1.158915419s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:45:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:45:26 | 200 |   4.41121607s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300279296 required="6.2 GiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.211-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36365"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.213-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] build info | build=0 commit="unknown" tid="139849870778368" timestamp=1745452496
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139849870778368" timestamp=1745452496 total_threads=16
+Apr 23 16:54:56 launchpad ollama[755027]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36365" tid="139849870778368" timestamp=1745452496
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 16:54:56 launchpad ollama[1550]: time=2025-04-23T16:54:56.464-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 16:54:56 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 16:54:56 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 16:54:56 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 16:54:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 16:54:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 16:54:57 launchpad ollama[755027]: INFO [main] model loaded | tid="139849870778368" timestamp=1745452497
+Apr 23 16:54:57 launchpad ollama[1550]: time=2025-04-23T16:54:57.468-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 16:54:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:57 | 200 |  1.733030575s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:54:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:58 | 200 |  291.119439ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:54:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:54:59 | 200 |  969.844968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 16:55:05 launchpad ollama[1550]: [GIN] 2025/04/23 - 16:55:05 | 200 |  6.301849199s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300803584 required="6.2 GiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.094-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43175"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.096-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] build info | build=0 commit="unknown" tid="140225427890176" timestamp=1745453294
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140225427890176" timestamp=1745453294 total_threads=16
+Apr 23 17:08:14 launchpad ollama[757052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43175" tid="140225427890176" timestamp=1745453294
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:08:14 launchpad ollama[1550]: time=2025-04-23T17:08:14.347-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:08:14 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:08:14 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:08:14 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:08:15 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:08:15 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:08:15 launchpad ollama[757052]: INFO [main] model loaded | tid="140225427890176" timestamp=1745453295
+Apr 23 17:08:15 launchpad ollama[1550]: time=2025-04-23T17:08:15.350-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:08:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:15 | 200 |  1.655003829s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:15 | 200 |  218.820867ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:16 | 200 |  945.013966ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:08:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:08:22 | 200 |  5.842121313s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.791-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300803584 required="6.2 GiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.791-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.792-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43739"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.793-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:15:54 launchpad ollama[1550]: time=2025-04-23T17:15:54.794-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] build info | build=0 commit="unknown" tid="139965543305216" timestamp=1745453754
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139965543305216" timestamp=1745453754 total_threads=16
+Apr 23 17:15:54 launchpad ollama[758218]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43739" tid="139965543305216" timestamp=1745453754
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:15:54 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:15:55 launchpad ollama[1550]: time=2025-04-23T17:15:55.045-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:15:55 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:15:55 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:15:55 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:15:55 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:15:55 launchpad ollama[758218]: INFO [main] model loaded | tid="139965543305216" timestamp=1745453755
+Apr 23 17:15:56 launchpad ollama[1550]: time=2025-04-23T17:15:56.049-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 17:15:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:56 | 200 |  1.665246282s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:15:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:56 | 200 |  228.248626ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:15:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:15:57 | 200 |  908.149474ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:16:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:16:03 | 200 |  6.195206239s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9300279296 required="6.2 GiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.716-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32867"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.718-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] build info | build=0 commit="unknown" tid="139971416432640" timestamp=1745454275
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139971416432640" timestamp=1745454275 total_threads=16
+Apr 23 17:24:35 launchpad ollama[759652]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32867" tid="139971416432640" timestamp=1745454275
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:24:35 launchpad ollama[1550]: time=2025-04-23T17:24:35.969-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:24:35 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:24:36 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:24:36 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:24:36 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:24:36 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:24:36 launchpad ollama[759652]: INFO [main] model loaded | tid="139971416432640" timestamp=1745454276
+Apr 23 17:24:36 launchpad ollama[1550]: time=2025-04-23T17:24:36.973-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:24:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:37 | 200 |  1.715784962s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:37 | 200 |  272.002243ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:38 | 200 |  891.992735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:24:44 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:24:44 | 200 |   6.42036935s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9296609280 required="6.2 GiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.556-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43877"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.558-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] build info | build=0 commit="unknown" tid="139761445388288" timestamp=1745454594
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139761445388288" timestamp=1745454594 total_threads=16
+Apr 23 17:29:54 launchpad ollama[760480]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43877" tid="139761445388288" timestamp=1745454594
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:29:54 launchpad ollama[1550]: time=2025-04-23T17:29:54.809-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:29:54 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:29:54 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:29:54 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:29:55 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:29:55 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:29:55 launchpad ollama[760480]: INFO [main] model loaded | tid="139761445388288" timestamp=1745454595
+Apr 23 17:29:55 launchpad ollama[1550]: time=2025-04-23T17:29:55.812-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 17:29:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:56 | 200 |  1.647709406s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:29:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:56 | 200 |  218.040739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:29:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:29:57 | 200 |  931.171771ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:30:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:30:03 | 200 |  6.553792836s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:33:41 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:41 | 200 |  5.513200945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:47 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:47 | 200 |  5.951761173s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:52 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:52 | 200 |  5.324029464s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:33:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:33:57 | 200 |  4.988908165s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:03 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:03 | 200 |  5.481583284s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:08 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:08 | 200 |  5.636927445s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:13 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:13 | 200 |  4.973832389s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:20 | 200 |  6.429673412s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:26 | 200 |  6.170376806s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:31 | 200 |  5.472332692s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:37 | 200 |  5.745230658s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:43 | 200 |  5.290740391s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:48 | 200 |  5.373433295s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:53 | 200 |  5.298728666s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:34:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:34:59 | 200 |  5.631392826s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:04 | 200 |  5.330692538s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:09 | 200 |  5.099768709s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:15 | 200 |  5.463271492s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:21 | 200 |  5.622586806s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:26 | 200 |  5.416842498s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:31 | 200 |   5.38208324s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:37 | 200 |  5.115517484s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:42 | 200 |  5.388257945s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:48 | 200 |  5.700935744s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:53 | 200 |  4.827407192s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:35:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:35:59 | 200 |  5.795170652s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:04 | 200 |  5.613308453s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:10 | 200 |  5.792461306s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:15 | 200 |  5.441378647s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:21 | 200 |  5.054836714s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:26 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:26 | 200 |  5.354167418s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:31 | 200 |  5.127645916s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:35 | 200 |  3.498165595s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:55 | 200 |  1.124623728s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:36:55 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:55 | 200 |   190.57658ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:56 | 200 |  191.353147ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  1.177496813s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  109.674102ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:57 | 200 |  193.270449ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:36:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:36:58 | 200 |  925.295133ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:37:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:37:04 | 200 |  5.495618968s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:41:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:32 | 200 |   1.35067355s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:41:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:33 | 200 |  236.562616ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:33 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:33 | 200 |  235.158709ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |  1.151250864s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |    113.8258ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:34 | 200 |  195.821102ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:36 | 200 |  1.156124603s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:41:43 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:41:43 | 200 |  7.450401236s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:46:56 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:46:56 | 200 |      25.745µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:48:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:48:14 | 200 |      24.329µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:48:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:48:17 | 200 |      26.686µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9115729920 required="6.2 GiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.2 GiB" free_swap="68.9 GiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.189-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43095"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.191-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] build info | build=0 commit="unknown" tid="139627022245888" timestamp=1745455753
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139627022245888" timestamp=1745455753 total_threads=16
+Apr 23 17:49:13 launchpad ollama[763723]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43095" tid="139627022245888" timestamp=1745455753
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 17:49:13 launchpad ollama[1550]: time=2025-04-23T17:49:13.442-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 17:49:13 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 17:49:13 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 17:49:13 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 17:49:14 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 17:49:14 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 17:49:14 launchpad ollama[763723]: INFO [main] model loaded | tid="139627022245888" timestamp=1745455754
+Apr 23 17:49:14 launchpad ollama[1550]: time=2025-04-23T17:49:14.446-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 23 17:49:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:14 | 200 |  1.621844807s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:14 | 200 |  178.194889ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:15 | 200 |  986.923716ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:15 | 200 |   95.944006ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:16 | 200 |  177.628998ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:17 | 200 |  902.883751ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:49:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:49:21 | 200 |  4.746923904s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:50:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:50:06 | 200 |     624.041µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 17:50:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:50:10 | 200 |       29.31µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:51:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:09 | 200 |  201.501533ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:09 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:09 | 200 |  188.590244ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:10 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:10 | 200 |  1.141851794s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:51:14 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:14 | 200 |  4.147923929s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:51:15 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:15 | 200 |  484.995762ms |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:51:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:51:17 | 200 |  2.396052006s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:52:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:31 | 200 |  194.429789ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:31 | 200 |  174.473469ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:32 | 200 |  1.072062433s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:52:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:52:35 | 200 |  2.747763494s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:53:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:53:00 | 200 |       33.53µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 17:54:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:28 | 200 |  205.822094ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:28 | 200 |  191.084425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  1.216033327s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  107.195288ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:29 | 200 |  185.226544ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:30 | 200 |  957.627968ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:54:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:54:36 | 200 |  5.847705896s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:55:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:55:59 | 200 |  199.620826ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:55:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:55:59 | 200 |  187.281075ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:01 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:01 | 200 |  1.391120446s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:04 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:04 | 200 |  3.404000606s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:57 | 200 |  196.926397ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:57 | 200 |  182.929148ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:56:58 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:56:58 | 200 |  1.340671128s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 17:57:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 17:57:00 | 200 |  1.768564865s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:00:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:00:22 | 200 |      84.783µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:03:00 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:03:00 | 200 |      41.155µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:04:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:04:50 | 200 |     590.418µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 18:05:06 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:06 | 200 |     590.438µs |       127.0.0.1 | GET      "/api/tags"
+Apr 23 18:05:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:32 | 200 |      24.818µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:05:42 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:05:42 | 200 |       24.93µs |       127.0.0.1 | GET      "/api/version"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9156427776 required="6.2 GiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.076-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44079"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.078-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] build info | build=0 commit="unknown" tid="140340407275520" timestamp=1745456866
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140340407275520" timestamp=1745456866 total_threads=16
+Apr 23 18:07:46 launchpad ollama[766850]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44079" tid="140340407275520" timestamp=1745456866
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 18:07:46 launchpad ollama[1550]: time=2025-04-23T18:07:46.329-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 18:07:46 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 18:07:46 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 18:07:46 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 18:07:47 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 18:07:47 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 18:07:47 launchpad ollama[766850]: INFO [main] model loaded | tid="140340407275520" timestamp=1745456867
+Apr 23 18:07:47 launchpad ollama[1550]: time=2025-04-23T18:07:47.332-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 18:07:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:48 | 200 |  2.548020507s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:49 | 200 |  1.111278657s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:50 | 200 |  1.148013528s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:51 | 200 |  1.095264503s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:53 | 200 |  1.150553696s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:54 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:54 | 200 |  1.095690186s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:07:59 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:07:59 | 200 |  4.989010415s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:08:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:37 | 200 |  201.013496ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:38 | 200 |  186.500957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  1.124483688s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  103.263269ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:39 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:39 | 200 |  182.676138ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:40 | 200 |  1.222491182s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:08:46 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:08:46 | 200 |  5.496173932s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:11:16 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:16 | 200 |  241.451316ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:17 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:17 | 200 |  224.899449ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  1.166300681s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  143.182785ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:18 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:18 | 200 |  225.587214ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:19 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:19 | 200 |  1.280338269s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:11:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:11:27 | 200 |  7.219407188s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9196863488 required="6.2 GiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.7 GiB" free_swap="68.9 GiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.138-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33497"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.140-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] build info | build=0 commit="unknown" tid="139795465789440" timestamp=1745457498
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139795465789440" timestamp=1745457498 total_threads=16
+Apr 23 18:18:18 launchpad ollama[768669]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33497" tid="139795465789440" timestamp=1745457498
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 23 18:18:18 launchpad ollama[1550]: time=2025-04-23T18:18:18.390-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 23 18:18:18 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 23 18:18:18 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 23 18:18:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 23 18:18:19 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 23 18:18:19 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 23 18:18:19 launchpad ollama[768669]: INFO [main] model loaded | tid="139795465789440" timestamp=1745457499
+Apr 23 18:18:19 launchpad ollama[1550]: time=2025-04-23T18:18:19.395-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 23 18:18:19 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457499
+Apr 23 18:18:20 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:20 | 200 |   2.35511796s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:20 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457500
+Apr 23 18:18:21 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:21 | 200 |  910.036838ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:22 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:22 | 200 |  1.081006708s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:22 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457502
+Apr 23 18:18:23 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:23 | 200 |  832.260372ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:23 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1025 n_keep=0 n_left=2048 n_shift=1024 tid="139795465789440" timestamp=1745457503
+Apr 23 18:18:24 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:24 | 200 |  911.856095ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:25 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:25 | 200 |   1.26152979s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:18:25 launchpad ollama[768669]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1661 n_keep=24 n_left=2024 n_shift=1012 tid="139795465789440" timestamp=1745457505
+Apr 23 18:18:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:18:30 | 200 |  4.607431092s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:19:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:35 | 200 |  195.270104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:35 | 200 |  182.918735ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:36 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:36 | 200 |  1.129369059s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:37 | 200 |   97.142778ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:37 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:37 | 200 |  176.400084ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:38 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:38 | 200 |  1.102297743s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:19:40 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:19:40 | 200 |   2.25545781s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:22:27 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:27 | 200 |  613.790527ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:28 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:28 | 200 |  579.137029ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:29 | 200 |  1.237361934s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:29 | 200 |  503.335489ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:30 | 200 |  605.293164ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:31 | 200 |  988.700665ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:22:35 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:22:35 | 200 |  4.192560785s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:23:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:29 | 200 |  194.169262ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:29 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:29 | 200 |  180.586316ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:30 | 200 |  1.140924415s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:30 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:30 | 200 |   60.542204ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:31 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:31 | 200 |  181.996175ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:32 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:32 | 200 |  1.130667312s |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:34 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:34 | 200 |  1.894084877s |       127.0.0.1 | POST     "/api/chat"
+Apr 23 18:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:48 | 200 |  173.878237ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:48 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:48 | 200 |   174.43425ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:49 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:49 | 200 |  981.156606ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:50 | 200 |  129.357036ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:50 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:50 | 200 |  169.839608ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:51 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:51 | 200 |  747.644882ms |       127.0.0.1 | POST     "/api/embed"
+Apr 23 18:23:53 launchpad ollama[1550]: [GIN] 2025/04/23 - 18:23:53 | 200 |  2.242286178s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:40:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:40:18 | 200 |     618.408µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 08:40:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:40:18 | 200 |      28.394µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.589-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9296543744 required="6.2 GiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.589-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.6 GiB" free_swap="68.9 GiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.590-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44627"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.591-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] build info | build=0 commit="unknown" tid="139752163717120" timestamp=1745509337
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139752163717120" timestamp=1745509337 total_threads=16
+Apr 24 08:42:17 launchpad ollama[792029]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44627" tid="139752163717120" timestamp=1745509337
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 08:42:17 launchpad ollama[1550]: time=2025-04-24T08:42:17.842-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 08:42:17 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 08:42:17 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 08:42:17 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 08:42:18 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 08:42:18 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 08:42:18 launchpad ollama[792029]: INFO [main] model loaded | tid="139752163717120" timestamp=1745509338
+Apr 24 08:42:18 launchpad ollama[1550]: time=2025-04-24T08:42:18.845-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 08:42:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:42:24 | 200 |  6.819686708s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9200992256 required="6.2 GiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.002-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 38937"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.003-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.004-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] build info | build=0 commit="unknown" tid="139898628452352" timestamp=1745510013
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139898628452352" timestamp=1745510013 total_threads=16
+Apr 24 08:53:33 launchpad ollama[795550]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="38937" tid="139898628452352" timestamp=1745510013
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 08:53:33 launchpad ollama[1550]: time=2025-04-24T08:53:33.255-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 08:53:33 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 08:53:33 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 08:53:33 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 08:53:33 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 08:53:34 launchpad ollama[795550]: INFO [main] model loaded | tid="139898628452352" timestamp=1745510014
+Apr 24 08:53:34 launchpad ollama[1550]: time=2025-04-24T08:53:34.258-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 08:53:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:53:40 | 200 |   7.32566291s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:54:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:54:23 | 200 |  5.712938789s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 08:56:32 launchpad ollama[1550]: [GIN] 2025/04/24 - 08:56:32 | 200 |   4.05194468s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.887-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9297854464 required="6.2 GiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.887-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.888-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.888-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43035"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:07:28 launchpad ollama[1550]: time=2025-04-24T09:07:28.889-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] build info | build=0 commit="unknown" tid="140587686600704" timestamp=1745510848
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140587686600704" timestamp=1745510848 total_threads=16
+Apr 24 09:07:28 launchpad ollama[801791]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="43035" tid="140587686600704" timestamp=1745510848
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:07:28 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:07:29 launchpad ollama[1550]: time=2025-04-24T09:07:29.139-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:07:29 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:07:29 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:07:29 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:07:29 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:07:29 launchpad ollama[801791]: INFO [main] model loaded | tid="140587686600704" timestamp=1745510849
+Apr 24 09:07:30 launchpad ollama[1550]: time=2025-04-24T09:07:30.143-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 09:07:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:07:36 | 200 |  7.880096058s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:11:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:11:52 | 200 |  5.226571422s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:13:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:13:00 | 200 |  4.637749177s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9295757312 required="6.2 GiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.5 GiB" free_swap="68.9 GiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.633-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33465"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.634-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.635-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] build info | build=0 commit="unknown" tid="139956127256576" timestamp=1745511550
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139956127256576" timestamp=1745511550 total_threads=16
+Apr 24 09:19:10 launchpad ollama[804911]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33465" tid="139956127256576" timestamp=1745511550
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:19:10 launchpad ollama[1550]: time=2025-04-24T09:19:10.886-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:19:10 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:19:10 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:19:10 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:19:11 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:19:11 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:19:11 launchpad ollama[804911]: INFO [main] model loaded | tid="139956127256576" timestamp=1745511551
+Apr 24 09:19:11 launchpad ollama[1550]: time=2025-04-24T09:19:11.890-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 09:19:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:19:18 | 200 |  7.632560487s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.324-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9097248768 required="6.2 GiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.324-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.325-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 34541"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.326-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] build info | build=0 commit="unknown" tid="140467655323648" timestamp=1745512831
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140467655323648" timestamp=1745512831 total_threads=16
+Apr 24 09:40:31 launchpad ollama[811227]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="34541" tid="140467655323648" timestamp=1745512831
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:40:31 launchpad ollama[1550]: time=2025-04-24T09:40:31.577-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:40:31 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:40:31 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:40:31 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:40:32 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:40:32 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:40:32 launchpad ollama[811227]: INFO [main] model loaded | tid="140467655323648" timestamp=1745512832
+Apr 24 09:40:32 launchpad ollama[1550]: time=2025-04-24T09:40:32.581-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 09:40:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 09:40:41 | 200 |  9.911681647s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.517-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9051439104 required="6.2 GiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.517-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.518-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.518-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 33395"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.519-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] build info | build=0 commit="unknown" tid="140658720501760" timestamp=1745513993
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140658720501760" timestamp=1745513993 total_threads=16
+Apr 24 09:59:53 launchpad ollama[817802]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="33395" tid="140658720501760" timestamp=1745513993
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 09:59:53 launchpad ollama[1550]: time=2025-04-24T09:59:53.769-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 09:59:53 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 09:59:53 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 09:59:53 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 09:59:54 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 09:59:54 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 09:59:54 launchpad ollama[817802]: INFO [main] model loaded | tid="140658720501760" timestamp=1745513994
+Apr 24 09:59:54 launchpad ollama[1550]: time=2025-04-24T09:59:54.774-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 10:00:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:00:00 | 200 |  7.142717191s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:03:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:03:27 | 200 |  8.348778676s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985182208 required="6.2 GiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.2 GiB" free_swap="68.9 GiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.302-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.303-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 41833"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.304-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] build info | build=0 commit="unknown" tid="139819060563968" timestamp=1745514547
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139819060563968" timestamp=1745514547 total_threads=16
+Apr 24 10:09:07 launchpad ollama[820062]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="41833" tid="139819060563968" timestamp=1745514547
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 10:09:07 launchpad ollama[1550]: time=2025-04-24T10:09:07.554-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 10:09:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 10:09:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 10:09:07 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 10:09:08 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 10:09:08 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 10:09:08 launchpad ollama[820062]: INFO [main] model loaded | tid="139819060563968" timestamp=1745514548
+Apr 24 10:09:08 launchpad ollama[1550]: time=2025-04-24T10:09:08.558-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 10:09:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:09:13 | 200 |  6.476721392s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9072279552 required="6.2 GiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.8 GiB" free_swap="68.9 GiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.087-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35685"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.088-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.089-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] build info | build=0 commit="unknown" tid="139716123848704" timestamp=1745516365
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139716123848704" timestamp=1745516365 total_threads=16
+Apr 24 10:39:25 launchpad ollama[828764]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35685" tid="139716123848704" timestamp=1745516365
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 10:39:25 launchpad ollama[1550]: time=2025-04-24T10:39:25.339-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 10:39:25 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 10:39:25 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 10:39:25 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 10:39:26 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 10:39:26 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 10:39:26 launchpad ollama[828764]: INFO [main] model loaded | tid="139716123848704" timestamp=1745516366
+Apr 24 10:39:26 launchpad ollama[1550]: time=2025-04-24T10:39:26.343-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 10:39:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:39:34 | 200 |  9.279915837s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 10:42:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 10:42:31 | 200 |  8.154876417s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:11:08 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:11:08 | 200 |       688.1µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 11:11:08 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:11:08 | 200 |      24.276µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.161-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9106620416 required="6.2 GiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.161-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.162-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46613"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.163-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] build info | build=0 commit="unknown" tid="140295154032640" timestamp=1745518313
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140295154032640" timestamp=1745518313 total_threads=16
+Apr 24 11:11:53 launchpad ollama[836914]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46613" tid="140295154032640" timestamp=1745518313
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 11:11:53 launchpad ollama[1550]: time=2025-04-24T11:11:53.414-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 11:11:53 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 11:11:53 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 11:11:53 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 11:11:54 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 11:11:54 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 11:11:54 launchpad ollama[836914]: INFO [main] model loaded | tid="140295154032640" timestamp=1745518314
+Apr 24 11:11:54 launchpad ollama[1550]: time=2025-04-24T11:11:54.418-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 11:12:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:12:00 | 200 |  7.164835811s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:13:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:13:23 | 200 |  3.528328901s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 11:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:52:49 | 200 |     767.714µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 11:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:52:49 | 200 |      29.886µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9094627328 required="6.2 GiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.7 GiB" free_swap="68.9 GiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.258-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45549"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.259-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] build info | build=0 commit="unknown" tid="140288459870208" timestamp=1745520785
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140288459870208" timestamp=1745520785 total_threads=16
+Apr 24 11:53:05 launchpad ollama[849688]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45549" tid="140288459870208" timestamp=1745520785
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 11:53:05 launchpad ollama[1550]: time=2025-04-24T11:53:05.510-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 11:53:05 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 11:53:05 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 11:53:05 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 11:53:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 11:53:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 11:53:06 launchpad ollama[849688]: INFO [main] model loaded | tid="140288459870208" timestamp=1745520786
+Apr 24 11:53:06 launchpad ollama[1550]: time=2025-04-24T11:53:06.514-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 11:53:07 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:07 | 200 |  2.185362823s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:55 | 200 |  197.529098ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:55 | 200 |  182.645663ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:53:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:53:56 | 200 |  794.044825ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 11:54:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 11:54:01 | 200 |  4.798343251s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:36:21 | 200 |     989.017µs |       127.0.0.1 | GET      "/api/tags"
+Apr 24 17:36:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:36:21 | 200 |      31.076µs |       127.0.0.1 | GET      "/api/version"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8993701888 required="6.2 GiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.498-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.499-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45731"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.500-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] build info | build=0 commit="unknown" tid="140650386157568" timestamp=1745541444
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140650386157568" timestamp=1745541444 total_threads=16
+Apr 24 17:37:24 launchpad ollama[907052]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45731" tid="140650386157568" timestamp=1745541444
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 17:37:24 launchpad ollama[1550]: time=2025-04-24T17:37:24.750-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 17:37:24 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 17:37:24 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 17:37:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 17:37:25 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 17:37:25 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 17:37:25 launchpad ollama[907052]: INFO [main] model loaded | tid="140650386157568" timestamp=1745541445
+Apr 24 17:37:25 launchpad ollama[1550]: time=2025-04-24T17:37:25.755-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 17:37:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:26 | 200 |  2.090572142s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:27 | 200 |  641.604522ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:28 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:28 | 200 |   1.12206052s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:28 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:28 | 200 |  602.114962ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:29 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:29 | 200 |  643.702023ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:30 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:30 | 200 |  947.871468ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:37:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:37:35 | 200 |  5.141216689s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:41:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:37 | 200 |  253.821343ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:38 | 200 |   233.74639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  1.288696981s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  151.585566ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:39 | 200 |  230.968708ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:40 | 200 |  1.129138278s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:41:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:41:49 | 200 |  8.188421146s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9019129856 required="6.2 GiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.3 GiB" free_swap="68.9 GiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.915-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 36907"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 17:48:43 launchpad ollama[1550]: time=2025-04-24T17:48:43.917-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] build info | build=0 commit="unknown" tid="140427190984704" timestamp=1745542123
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140427190984704" timestamp=1745542123 total_threads=16
+Apr 24 17:48:43 launchpad ollama[908846]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36907" tid="140427190984704" timestamp=1745542123
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 17:48:43 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 17:48:44 launchpad ollama[1550]: time=2025-04-24T17:48:44.168-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 17:48:44 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 17:48:44 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 17:48:44 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 17:48:44 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 17:48:44 launchpad ollama[908846]: INFO [main] model loaded | tid="140427190984704" timestamp=1745542124
+Apr 24 17:48:45 launchpad ollama[1550]: time=2025-04-24T17:48:45.173-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 24 17:48:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:45 | 200 |  1.710108976s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:45 | 200 |  261.643597ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:46 | 200 |  920.281896ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:46 | 200 |  140.932473ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:47 | 200 |  262.296399ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:48 | 200 |  931.731337ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:48:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:48:54 | 200 |  6.073523978s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:50:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:40 | 200 |  200.958593ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:40 | 200 |  184.805268ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  1.323112494s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  139.446503ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:41 | 200 |  179.133677ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:43 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:43 | 200 |  1.105684492s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:50:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:50:47 | 200 |  4.172603628s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:54:17 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:17 | 200 |  252.141997ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:17 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:17 | 200 |  230.037406ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  1.238542557s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  149.908375ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:19 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:19 | 200 |  233.893207ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:20 | 200 |  1.134658459s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:54:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:54:25 | 200 |  4.419488004s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 17:56:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:52 | 200 |  997.425782ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:53 | 200 |   976.49382ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:54 | 200 |  1.319597493s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:55 | 200 |  854.225048ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:56 | 200 |  978.069183ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:56:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:56:57 | 200 |  1.176739016s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 17:57:02 launchpad ollama[1550]: [GIN] 2025/04/24 - 17:57:02 | 200 |  4.568611013s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:01:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:40 | 200 |  285.525117ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:40 | 200 |  263.224302ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  1.284613852s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  183.780122ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:42 | 200 |  265.112546ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:44 | 200 |  1.308307094s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:01:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:01:49 | 200 |  5.455469807s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:02:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:39 | 200 |    190.2052ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:39 | 200 |  171.393986ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:40 | 200 |  1.207327682s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:40 | 200 |   85.131773ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:41 | 200 |  167.706202ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:41 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:41 | 200 |   740.56885ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:02:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:02:44 | 200 |  2.739863687s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:03:43 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:43 | 200 |  209.005892ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:44 | 200 |  192.816082ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  1.218627175s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  107.435739ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:45 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:45 | 200 |  189.349972ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:46 | 200 |  1.058925751s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:03:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:03:51 | 200 |  4.786284933s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:04:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:51 | 200 |  222.424954ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:52 | 200 |  197.341345ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |   1.41089556s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |  108.523858ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:53 | 200 |  191.368248ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:55 | 200 |  1.228077154s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:04:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:04:59 | 200 |  4.042051614s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:07:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:37 | 200 |  226.879035ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:37 | 200 |  199.111256ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:38 | 200 |  1.063590276s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:39 | 200 |   72.477831ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:39 | 200 |   195.27252ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:40 | 200 |  1.203461936s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:07:46 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:07:46 | 200 |   5.63723919s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:12:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:31 | 200 |  268.460203ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:31 | 200 |  255.364856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  1.256656059s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  169.377662ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:33 | 200 |  249.408594ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:34 | 200 |  1.110234089s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:12:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:12:40 | 200 |  5.827003425s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:13:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:37 | 200 |  205.160617ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:37 | 200 |  188.095185ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:38 | 200 |  1.280872614s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:38 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:38 | 200 |  101.873219ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:39 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:39 | 200 |  184.533949ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:40 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:40 | 200 |  1.034372568s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:13:44 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:13:44 | 200 |  4.254397145s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:16:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:25 | 200 |  242.856859ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:25 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:25 | 200 |  223.365467ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  1.113844362s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  141.013523ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:26 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:26 | 200 |  218.619502ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:27 | 200 |  1.082765987s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:16:33 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:16:33 | 200 |  5.733372573s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.927-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8970436608 required="6.2 GiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.927-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.928-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 46517"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 18:39:05 launchpad ollama[1550]: time=2025-04-24T18:39:05.929-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] build info | build=0 commit="unknown" tid="140099898662912" timestamp=1745545145
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140099898662912" timestamp=1745545145 total_threads=16
+Apr 24 18:39:05 launchpad ollama[919991]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="46517" tid="140099898662912" timestamp=1745545145
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 18:39:05 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 18:39:06 launchpad ollama[1550]: time=2025-04-24T18:39:06.181-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 18:39:06 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 18:39:06 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 18:39:06 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 18:39:06 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 18:39:06 launchpad ollama[919991]: INFO [main] model loaded | tid="140099898662912" timestamp=1745545146
+Apr 24 18:39:07 launchpad ollama[1550]: time=2025-04-24T18:39:07.184-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 18:39:07 launchpad ollama[919991]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=3479 n_keep=24 n_left=2024 n_shift=1012 tid="140099898662912" timestamp=1745545147
+Apr 24 18:39:15 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:15 | 200 |  9.293126049s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:39:16 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:16 | 200 |   1.81402386s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:18 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:18 | 200 |  1.817578195s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:20 | 200 |  1.321442909s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:21 | 200 |  1.768898059s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:23 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:23 | 200 |  1.815065399s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:24 | 200 |   1.13415786s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:39:31 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:39:31 | 200 |  6.271196545s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:42:12 launchpad ollama[919991]: INFO [update_slots] input truncated | n_ctx=2048 n_erase=1645 n_keep=24 n_left=2024 n_shift=1012 tid="140099898662912" timestamp=1745545332
+Apr 24 18:42:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:20 | 200 |  8.327494275s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:42:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:20 | 200 |  225.432321ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:21 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:21 | 200 |  226.489769ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |   1.16798263s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |  144.278618ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:22 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:22 | 200 |  226.540793ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:24 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:24 | 200 |  1.284153205s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:42:27 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:42:27 | 200 |  3.720294838s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:43:34 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:34 | 200 |  1.024618794s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:43:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:35 | 200 |   179.97332ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:35 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:35 | 200 |   180.02421ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |  1.090675027s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |   98.967754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:36 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:36 | 200 |  178.429106ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:37 | 200 |  857.260775ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:43:42 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:43:42 | 200 |  5.007495698s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  1.023029902s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  187.113902ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:51 | 200 |  183.078764ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:52 | 200 |  1.127197605s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:53 | 200 |  102.356162ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:53 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:53 | 200 |  185.324031ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:54 | 200 |  1.031321065s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:45:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:45:57 | 200 |  3.434281484s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:48:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:48:59 | 200 |  4.081209653s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:48:59 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:48:59 | 200 |   288.92447ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:00 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:00 | 200 |  286.772565ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  1.333809227s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  203.341443ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:01 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:01 | 200 |  285.595388ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:03 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:03 | 200 |  1.091997316s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:49:06 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:49:06 | 200 |  3.462818189s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:49 | 200 |  1.053412836s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:52:49 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:49 | 200 |  217.336173ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:50 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:50 | 200 |  216.634086ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  932.691712ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  135.776263ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:51 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:51 | 200 |  216.992277ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:52 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:52 | 200 |  1.014736642s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:52:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:52:57 | 200 |  5.298822783s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.186-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9081978880 required="6.2 GiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.186-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.187-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.187-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 40493"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.188-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] build info | build=0 commit="unknown" tid="140547275657216" timestamp=1745546312
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140547275657216" timestamp=1745546312 total_threads=16
+Apr 24 18:58:32 launchpad ollama[923081]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40493" tid="140547275657216" timestamp=1745546312
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 24 18:58:32 launchpad ollama[1550]: time=2025-04-24T18:58:32.439-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 24 18:58:32 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 24 18:58:32 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 24 18:58:32 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 24 18:58:33 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 24 18:58:33 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 24 18:58:33 launchpad ollama[923081]: INFO [main] model loaded | tid="140547275657216" timestamp=1745546313
+Apr 24 18:58:33 launchpad ollama[1550]: time=2025-04-24T18:58:33.441-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 24 18:58:37 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:37 | 200 |  5.044834878s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 18:58:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:54 | 200 |  285.297208ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:54 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:54 | 200 |   260.43005ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:55 | 200 |  952.027957ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:56 | 200 |  218.050077ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:56 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:56 | 200 |  261.842981ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:58:57 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:58:57 | 200 |  920.633754ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 18:59:04 launchpad ollama[1550]: [GIN] 2025/04/24 - 18:59:04 | 200 |  7.195869601s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 19:01:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:47 | 200 |  192.035927ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:47 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:47 | 200 |  180.653514ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |  1.179824626s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |   96.208666ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:48 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:48 | 200 |  178.822281ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:50 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:50 | 200 |  1.257093011s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:01:55 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:01:55 | 200 |  5.687962194s |       127.0.0.1 | POST     "/api/chat"
+Apr 24 19:03:11 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:11 | 200 |  187.164947ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:11 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:11 | 200 |  169.136714ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |  1.328971141s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |   84.264482ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:13 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:13 | 200 |  166.751689ms |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:15 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:15 | 200 |  1.536837721s |       127.0.0.1 | POST     "/api/embed"
+Apr 24 19:03:20 launchpad ollama[1550]: [GIN] 2025/04/24 - 19:03:20 | 200 |  4.833257563s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:06:02 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:06:02 | 200 |     656.506µs |       127.0.0.1 | GET      "/api/tags"
+Apr 25 10:06:02 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:06:02 | 200 |      34.553µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9075621888 required="6.2 GiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.315-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 32785"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.317-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] build info | build=0 commit="unknown" tid="140636594302976" timestamp=1745601201
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140636594302976" timestamp=1745601201 total_threads=16
+Apr 25 10:13:21 launchpad ollama[935186]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="32785" tid="140636594302976" timestamp=1745601201
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:13:21 launchpad ollama[1550]: time=2025-04-25T10:13:21.568-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:13:21 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:13:21 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:13:21 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:13:22 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:13:22 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:13:22 launchpad ollama[935186]: INFO [main] model loaded | tid="140636594302976" timestamp=1745601202
+Apr 25 10:13:22 launchpad ollama[1550]: time=2025-04-25T10:13:22.571-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 25 10:13:23 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:23 | 200 |  2.603573209s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:24 | 200 |  1.154762644s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:25 | 200 |  912.044721ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:27 | 200 |   1.11257606s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:28 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:28 | 200 |  1.195339685s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:29 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:29 | 200 |  948.435089ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:13:31 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:13:31 | 200 |  2.700882535s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.670-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8978890752 required="6.2 GiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.670-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.1 GiB" free_swap="68.9 GiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.671-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 44195"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.672-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] build info | build=0 commit="unknown" tid="140383647281152" timestamp=1745601563
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140383647281152" timestamp=1745601563 total_threads=16
+Apr 25 10:19:23 launchpad ollama[936144]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="44195" tid="140383647281152" timestamp=1745601563
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:19:23 launchpad ollama[1550]: time=2025-04-25T10:19:23.924-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:19:23 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:19:23 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:19:23 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:19:24 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:19:24 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:19:24 launchpad ollama[936144]: INFO [main] model loaded | tid="140383647281152" timestamp=1745601564
+Apr 25 10:19:24 launchpad ollama[1550]: time=2025-04-25T10:19:24.927-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 25 10:19:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:25 | 200 |  1.663071879s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:25 | 200 |   202.88545ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |   958.12173ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |  159.322856ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:26 | 200 |  201.932104ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:27 | 200 |  804.617649ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:19:33 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:19:33 | 200 |   5.64176218s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=8985313280 required="6.2 GiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.365-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 37745"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.367-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] build info | build=0 commit="unknown" tid="139894329585664" timestamp=1745601952
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139894329585664" timestamp=1745601952 total_threads=16
+Apr 25 10:25:52 launchpad ollama[937189]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="37745" tid="139894329585664" timestamp=1745601952
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:25:52 launchpad ollama[1550]: time=2025-04-25T10:25:52.618-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:25:52 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:25:52 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 10:25:52 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:25:53 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 10:25:53 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 10:25:53 launchpad ollama[937189]: INFO [main] model loaded | tid="139894329585664" timestamp=1745601953
+Apr 25 10:25:53 launchpad ollama[1550]: time=2025-04-25T10:25:53.622-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.25 seconds"
+Apr 25 10:25:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:53 | 200 |  1.618330487s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:53 | 200 |  176.006639ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:54 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:54 | 200 |  1.012885767s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:55 | 200 |   52.663758ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:55 | 200 |  133.618171ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:25:56 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:25:56 | 200 |  1.337080749s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:26:03 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:26:03 | 200 |  7.192816778s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:27:06 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:06 | 200 |  216.879351ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:07 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:07 | 200 |  202.280451ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |  1.258996057s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |   113.90707ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:08 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:08 | 200 |  195.061757ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:10 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:10 | 200 |  1.358440515s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:27:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:27:14 | 200 |  4.660747669s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:31:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:24 | 200 |  262.036491ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:24 | 200 |  237.489923ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:25 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:25 | 200 |  1.239453145s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:26 | 200 |  196.405222ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:26 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:26 | 200 |  236.354144ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:27 | 200 |  1.020126427s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 10:31:30 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:31:30 | 200 |  2.991535105s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:42:39 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:42:39 | 200 |      26.374µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.408-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.569-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="53.0 GiB" free_swap="68.9 GiB"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.570-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.3 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 40315"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.571-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] build info | build=0 commit="unknown" tid="139938947252224" timestamp=1745603096
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139938947252224" timestamp=1745603096 total_threads=16
+Apr 25 10:44:56 launchpad ollama[940509]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="40315" tid="139938947252224" timestamp=1745603096
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 10:44:56 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 10:44:56 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 10:44:56 launchpad ollama[1550]: time=2025-04-25T10:44:56.876-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 10:44:56 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 10:44:57 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 10:44:57 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 10:44:57 launchpad ollama[940509]: INFO [main] model loaded | tid="139938947252224" timestamp=1745603097
+Apr 25 10:44:57 launchpad ollama[1550]: time=2025-04-25T10:44:57.880-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.31 seconds"
+Apr 25 10:45:32 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:32 | 200 | 35.885928419s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:45:32 launchpad ollama[1550]: time=2025-04-25T10:45:32.360-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:45:34 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:34 | 200 |  2.174434507s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:45:34 launchpad ollama[1550]: time=2025-04-25T10:45:34.577-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:45:37 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:45:37 | 200 |  2.734380952s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:50:17 launchpad ollama[1550]: time=2025-04-25T10:50:17.553-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:50:46 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:50:46 | 200 | 29.139636083s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:53:00 launchpad ollama[1550]: time=2025-04-25T10:53:00.792-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:53:13 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:53:13 | 200 |   12.4257238s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 10:53:55 launchpad ollama[1550]: time=2025-04-25T10:53:55.887-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 10:54:45 launchpad ollama[1550]: [GIN] 2025/04/25 - 10:54:45 | 200 | 50.075986347s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.698-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.865-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.9 GiB" free_swap="68.9 GiB"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.865-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 36671"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 11:02:02 launchpad ollama[1550]: time=2025-04-25T11:02:02.866-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] build info | build=0 commit="unknown" tid="139775431499776" timestamp=1745604122
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="139775431499776" timestamp=1745604122 total_threads=16
+Apr 25 11:02:02 launchpad ollama[972082]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="36671" tid="139775431499776" timestamp=1745604122
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 11:02:02 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 11:02:02 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 11:02:02 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: time=2025-04-25T11:02:03.167-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 11:02:03 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 11:02:03 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 11:02:03 launchpad ollama[972082]: INFO [main] model loaded | tid="139775431499776" timestamp=1745604123
+Apr 25 11:02:04 launchpad ollama[1550]: time=2025-04-25T11:02:04.170-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 25 11:02:20 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:02:20 | 200 | 18.013133037s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:04:23 launchpad ollama[1550]: time=2025-04-25T11:04:23.808-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:04:47 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:04:47 | 200 | 24.172131361s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:05:50 launchpad ollama[1550]: time=2025-04-25T11:05:50.437-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:06:04 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:06:04 | 200 | 14.112729529s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 11:07:24 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:07:24 | 200 |      28.291µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 11:09:20 launchpad ollama[1550]: time=2025-04-25T11:09:20.794-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 11:11:22 launchpad ollama[1550]: [GIN] 2025/04/25 - 11:11:22 | 200 |          2m2s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:40:42 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:40:42 | 200 |      16.444µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:40:42 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:40:42 | 200 |      46.139µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:41:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:41:27 | 200 |     582.632µs |       127.0.0.1 | GET      "/api/tags"
+Apr 25 12:41:27 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:41:27 | 200 |      26.285µs |       127.0.0.1 | GET      "/api/version"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.687-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.854-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.4 GiB" free_swap="68.9 GiB"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.854-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=41 layers.offload=36 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="9.2 GiB" memory.required.partial="8.3 GiB" memory.required.kv="1.6 GiB" memory.required.allocations="[8.3 GiB]" memory.weights.total="8.2 GiB" memory.weights.repeating="8.1 GiB" memory.weights.nonrepeating="128.3 MiB" memory.graph.full="204.0 MiB" memory.graph.partial="244.1 MiB"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.855-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 36 --parallel 1 --port 42199"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 12:42:07 launchpad ollama[1550]: time=2025-04-25T12:42:07.856-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] build info | build=0 commit="unknown" tid="140557161463808" timestamp=1745610127
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140557161463808" timestamp=1745610127 total_threads=16
+Apr 25 12:42:07 launchpad ollama[1049016]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="42199" tid="140557161463808" timestamp=1745610127
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /var/lib/ollama/models/blobs/sha256-e73cc17c718156e5ad34b119eb363e2c10389a503673f9c36144c42dfde8334c (version GGUF V2)
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = codellama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   4:                          llama.block_count u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  11:                          general.file_type u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32016]   = ["", "", "", "<0x00>", "<...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32016]   = [0.000000, 0.000000, 0.000000, 0.0000...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32016]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - kv  19:               general.quantization_version u32              = 2
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type  f32:   81 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type q4_0:  281 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 3
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.1686 MB
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V2
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: vocab type       = SPM
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 32016
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_head           = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 40
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 5120
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 13824
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 1000000.0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 16384
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model type       = 13B
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model params     = 13.02 B
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: model size       = 6.86 GiB (4.53 BPW)
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: general.name     = codellama
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 1 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 2 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: UNK token        = 0 ''
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: LF token         = 13 '<0x0A>'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: PRE token        = 32007 '▁
'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: SUF token        = 32008 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: MID token        = 32009 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 32010 '▁'
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_print_meta: max token length = 48
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 12:42:07 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 12:42:07 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 12:42:07 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.34 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: time=2025-04-25T12:42:08.154-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors: offloading 36 repeating layers to GPU
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors: offloaded 36/41 layers to GPU
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =  7024.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  6127.03 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 2048
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 1000000.0
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 12:42:08 launchpad ollama[1550]: llama_kv_cache_init:  CUDA_Host KV buffer size =   160.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1440.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     0.14 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   234.06 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1286
+Apr 25 12:42:08 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 48
+Apr 25 12:42:08 launchpad ollama[1049016]: INFO [main] model loaded | tid="140557161463808" timestamp=1745610128
+Apr 25 12:42:09 launchpad ollama[1550]: time=2025-04-25T12:42:09.158-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.30 seconds"
+Apr 25 12:42:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:14 | 200 |       14.74µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:42:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:14 | 200 |      16.628µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:42:23 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:42:23 | 200 | 15.714145874s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:42:58 launchpad ollama[1550]: time=2025-04-25T12:42:58.623-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:43:14 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:43:14 | 200 | 15.942858189s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:45:12 launchpad ollama[1550]: time=2025-04-25T12:45:12.245-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:45:38 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:45:38 | 200 |      19.429µs |       127.0.0.1 | HEAD     "/"
+Apr 25 12:45:38 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:45:38 | 200 |      24.797µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 12:46:31 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:46:31 | 200 |         1m19s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 12:47:38 launchpad ollama[1550]: time=2025-04-25T12:47:38.974-07:00 level=WARN source=types.go:509 msg="invalid option provided" option=rope_frequency_base
+Apr 25 12:48:01 launchpad ollama[1550]: [GIN] 2025/04/25 - 12:48:01 | 200 | 22.109538778s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 13:23:40 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:40 | 200 |      15.125µs |       127.0.0.1 | HEAD     "/"
+Apr 25 13:23:40 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:40 | 200 |       7.969µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.093-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9049407488 required="6.2 GiB"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.093-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="52.1 GiB" free_swap="68.9 GiB"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.094-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.095-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 35027"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.095-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.095-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.095-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 13:23:52 launchpad ollama[1091977]: INFO [main] build info | build=0 commit="unknown" tid="140093651165184" timestamp=1745612632
+Apr 25 13:23:52 launchpad ollama[1091977]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140093651165184" timestamp=1745612632 total_threads=16
+Apr 25 13:23:52 launchpad ollama[1091977]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="35027" tid="140093651165184" timestamp=1745612632
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 13:23:52 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 13:23:52 launchpad ollama[1550]: time=2025-04-25T13:23:52.347-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 13:23:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 13:23:52 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 13:23:52 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 13:23:52 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 13:23:52 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 13:23:53 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 13:23:53 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 13:23:53 launchpad ollama[1091977]: INFO [main] model loaded | tid="140093651165184" timestamp=1745612633
+Apr 25 13:23:53 launchpad ollama[1550]: time=2025-04-25T13:23:53.351-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 25 13:23:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:53 | 200 |  1.591997713s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:53 | 200 |  154.915331ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:54 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:54 | 200 |  1.141591253s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:54 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:54 | 200 |   34.377066ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:55 | 200 |  115.192206ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:55 | 200 |      14.037µs |       127.0.0.1 | HEAD     "/"
+Apr 25 13:23:55 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:55 | 200 |      16.753µs |       127.0.0.1 | GET      "/api/ps"
+Apr 25 13:23:56 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:56 | 200 |  1.254187497s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 13:23:58 launchpad ollama[1550]: [GIN] 2025/04/25 - 13:23:58 | 200 |  2.104877559s |       127.0.0.1 | POST     "/api/chat"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.139-07:00 level=INFO source=sched.go:714 msg="new model will fit in available VRAM in single GPU, loading" model=/var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa gpu=GPU-34355352-1d3b-7ef2-9549-a8ec2250eb6e parallel=4 available=9046589440 required="6.2 GiB"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.139-07:00 level=INFO source=server.go:103 msg="system memory" total="62.6 GiB" free="51.6 GiB" free_swap="68.9 GiB"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.140-07:00 level=INFO source=memory.go:326 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.141-07:00 level=INFO source=server.go:388 msg="starting llama server" cmd="/tmp/ollama568753210/runners/cuda_v12/ollama_llama_server --model /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 45357"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.141-07:00 level=INFO source=sched.go:449 msg="loaded runners" count=1
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.141-07:00 level=INFO source=server.go:587 msg="waiting for llama runner to start responding"
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.142-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server error"
+Apr 25 14:00:50 launchpad ollama[1106863]: INFO [main] build info | build=0 commit="unknown" tid="140585139654656" timestamp=1745614850
+Apr 25 14:00:50 launchpad ollama[1106863]: INFO [main] system info | n_threads=8 n_threads_batch=8 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="140585139654656" timestamp=1745614850 total_threads=16
+Apr 25 14:00:50 launchpad ollama[1106863]: INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="15" port="45357" tid="140585139654656" timestamp=1745614850
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /var/lib/ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa (version GGUF V3 (latest))
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   0:                       general.architecture str              = llama
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   2:                          llama.block_count u32              = 32
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  10:                          general.file_type u32              = 2
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128009
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - kv  21:               general.quantization_version u32              = 2
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - type  f32:   65 tensors
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - type q4_0:  225 tensors
+Apr 25 14:00:50 launchpad ollama[1550]: llama_model_loader: - type q6_K:    1 tensors
+Apr 25 14:00:50 launchpad ollama[1550]: time=2025-04-25T14:00:50.393-07:00 level=INFO source=server.go:621 msg="waiting for server to become available" status="llm server loading model"
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_vocab: special tokens cache size = 256
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_vocab: token to piece cache size = 0.8000 MB
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: format           = GGUF V3 (latest)
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: arch             = llama
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: vocab type       = BPE
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_vocab          = 128256
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_merges         = 280147
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: vocab_only       = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_train      = 8192
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_embd           = 4096
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_layer          = 32
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_head           = 32
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_head_kv        = 8
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_rot            = 128
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_swa            = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_k    = 128
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_head_v    = 128
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_gqa            = 4
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_k_gqa     = 1024
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_embd_v_gqa     = 1024
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_eps       = 0.0e+00
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: f_logit_scale    = 0.0e+00
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_ff             = 14336
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_expert         = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_expert_used    = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: causal attn      = 1
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: pooling type     = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: rope type        = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: rope scaling     = linear
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: freq_base_train  = 500000.0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: freq_scale_train = 1
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: n_ctx_orig_yarn  = 8192
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: rope_finetuned   = unknown
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_conv       = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_inner      = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: ssm_d_state      = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_rank      = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: ssm_dt_b_c_rms   = 0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: model type       = 8B
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: model ftype      = Q4_0
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: model params     = 8.03 B
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW)
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: LF token         = 128 'Ä'
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_print_meta: max token length = 256
+Apr 25 14:00:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+Apr 25 14:00:50 launchpad ollama[1550]: ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+Apr 25 14:00:50 launchpad ollama[1550]: ggml_cuda_init: found 1 CUDA devices:
+Apr 25 14:00:50 launchpad ollama[1550]:   Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors: ggml ctx size =    0.27 MiB
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors: offloading 32 repeating layers to GPU
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors: offloading non-repeating layers to GPU
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors: offloaded 33/33 layers to GPU
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors:        CPU buffer size =   281.81 MiB
+Apr 25 14:00:50 launchpad ollama[1550]: llm_load_tensors:      CUDA0 buffer size =  4155.99 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: n_ctx      = 8192
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: n_batch    = 512
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: n_ubatch   = 512
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: flash_attn = 0
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: freq_base  = 500000.0
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: freq_scale = 1
+Apr 25 14:00:51 launchpad ollama[1550]: llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host  output buffer size =     2.02 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: graph nodes  = 1030
+Apr 25 14:00:51 launchpad ollama[1550]: llama_new_context_with_model: graph splits = 2
+Apr 25 14:00:51 launchpad ollama[1106863]: INFO [main] model loaded | tid="140585139654656" timestamp=1745614851
+Apr 25 14:00:51 launchpad ollama[1550]: time=2025-04-25T14:00:51.396-07:00 level=INFO source=server.go:626 msg="llama runner started in 1.26 seconds"
+Apr 25 14:00:51 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:51 | 200 |  1.593385331s |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:51 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:51 | 200 |  156.720407ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:52 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:52 | 200 |  930.819478ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:52 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:52 | 200 |    73.64871ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:52 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:52 | 200 |  157.318942ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:53 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:53 | 200 |  914.613947ms |       127.0.0.1 | POST     "/api/embed"
+Apr 25 14:00:56 launchpad ollama[1550]: [GIN] 2025/04/25 - 14:00:56 | 200 |  2.152146818s |       127.0.0.1 | POST     "/api/chat"
diff --git a/flakes/tensorflow/requirements.txt.bak b/flakes/tensorflow/requirements.txt.bak
new file mode 100644
index 0000000..8cfc001
--- /dev/null
+++ b/flakes/tensorflow/requirements.txt.bak
@@ -0,0 +1,2 @@
+tensorflow==2.13.0
+tensorboard==2.18.0
\ No newline at end of file
diff --git a/flakes/tensorflow/shell.nix b/flakes/tensorflow/shell.nix
new file mode 100644
index 0000000..264f1f7
--- /dev/null
+++ b/flakes/tensorflow/shell.nix
@@ -0,0 +1,41 @@
+with import <nixpkgs> {
+  config.cudaSupport = true;
+  config.allowUnfree = true;
+};
+let
+  python3 = pkgs.python311;
+in
+mkShell {
+  name = "tensorflow-cuda-shell";
+  buildInputs = with python3.pkgs; [
+    pip
+    numpy
+    setuptools
+    virtualenv
+    tensorflow tensorboard
+  ];
+  shellHook = ''
+    export CUDA_PATH=${pkgs.cudatoolkit}
+    export LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.cudaPackages_11.cudatoolkit}/lib:${pkgs.cudaPackages_11.cudnn}/lib:${pkgs.cudaPackages_11.cudatoolkit.lib}/lib:$LD_LIBRARY_PATH
+    export EXTRA_LDFLAGS="-L/lib -L${pkgs.linuxPackages.nvidia_x11}/lib"
+    export EXTRA_CCFLAGS="-I/usr/include"
+    alias pip="PIP_PREFIX='$(pwd)/_build/pip_packages' TMPDIR='$HOME' \pip"
+    export PYTHONPATH="$(pwd)/_build/pip_packages/lib/python3.11/site-packages:$PYTHONPATH"
+    export PATH="$(pwd)/_build/pip_packages/bin:$PATH"
+    unset SOURCE_DATE_EPOCH
+
+    # Use bash instead of zsh
+    export SHELL=${pkgs.bash}/bin/bash
+
+    # set up a virtual environment
+    export VIRTUAL_ENV=$PWD/venv
+    if [ ! -d "$VIRTUAL_ENV" ]; then
+      virtualenv "$VIRTUAL_ENV"
+      source "$VIRTUAL_ENV/bin/activate"
+      python -m pip install --upgrade pip
+      # python -m pip install -r requirements.txt
+    else
+      source "$VIRTUAL_ENV/bin/activate"
+    fi
+  '';
+}
\ No newline at end of file
diff --git a/flakes/tensorflow/tf.py b/flakes/tensorflow/tf.py
new file mode 100644
index 0000000..4676dbf
--- /dev/null
+++ b/flakes/tensorflow/tf.py
@@ -0,0 +1,23 @@
+import tensorflow as tf
+import re
+
+# Create a summary writer
+logdir = "logs/"
+writer = tf.summary.create_file_writer(logdir)
+
+# Read the journalctl logs
+with open("ollama.log", "r") as f:
+    for line in f:
+        # Example: Parse the log line (you may need to adjust this regex)
+        match = re.search(r'(\d+-\d+-\d+ \d+:\d+:\d+).*?(\w+): (.*)', line)
+        if match:
+            timestamp = match.group(1)
+            log_level = match.group(2)
+            message = match.group(3)
+
+            # Write to TensorBoard
+            with writer.as_default():
+                tf.summary.text("log_message", f"{timestamp} [{log_level}] {message}", step=0)
+
+# Close the writer
+writer.close()
diff --git a/hardware-configuration.nix b/hardware-configuration.nix
index 114f12b..e20259b 100644
--- a/hardware-configuration.nix
+++ b/hardware-configuration.nix
@@ -4,9 +4,9 @@
 { config, lib, pkgs, modulesPath, ... }:
 
 {
-  imports =
-    [ (modulesPath + "/installer/scan/not-detected.nix")
-    ];
+  imports = [
+    (modulesPath + "/installer/scan/not-detected.nix")
+  ];
 
   # Bootloader.
   # boot.loader.systemd-boot.enable = true; Disabled for Grub
@@ -55,17 +55,18 @@
 
   nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
   hardware = {
-    cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
     bluetooth.enable = true; # enables support for Bluetooth
     bluetooth.powerOnBoot = true; # powers up the default Bluetooth controller on boot
+    cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
+    # opengl.setLdLibraryPath = true; # Invalid option
+    nvidia-container-toolkit = {
+      enable = true;
+    };
     pulseaudio = {
       enable = false;
       daemon = {
         logLevel = "debug";
       };
     };
-    nvidia-container-toolkit = {
-      enable = true;
-    };
   };
 }
diff --git a/hm/home.nix b/hm/home.nix
index fd3c1ca..bff6f69 100644
--- a/hm/home.nix
+++ b/hm/home.nix
@@ -58,6 +58,7 @@ let
     # Kitty
     icat="kitten icat";
     s="kitten ssh";
+    kitty-logs="kitty --class='kitty-logs' journalctl -f";
 
     # init_0="sudo systemctl isolate emergency.target";
     # init_1="sudo systemctl isolate rescue.target";
@@ -170,12 +171,37 @@ in
     libreoffice
     bottles winePackages.wayland
 
-    mpv vlc
+    vlc
   ] ++ (import ./pkgs { inherit unstablePkgs pkgs system zen-browser; });
 
 
   programs = {
     home-manager.enable = true;
+    direnv = {
+      enable = true;
+      package = pkgs.direnv;
+      enableBashIntegration = true;
+      enableZshIntegration = true;
+      nix-direnv = {
+        enable = true;
+        package = pkgs.nix-direnv;
+      };
+    };
+    mpv = {
+      enable = true;
+      package = pkgs.mpv-unwrapped.wrapper { mpv = pkgs.mpv-unwrapped.override { vapoursynthSupport = true; }; youtubeSupport = true; };
+      bindings = {
+        WHEEL_UP = "seek 10";
+        WHEEL_DOWN = "seek -10";
+        "Alt+0" = "set window-scale 0.5";
+      };
+      # config = {
+
+      # };
+      # includes = [
+      #   "${config.home.homeDirectory}/.config/mpv/config.inc"
+      # ];
+    };
     ripgrep.enable = true;
     vscode = {
       enable = true;
@@ -188,18 +214,7 @@ in
         bbenoist.nix
       ];
     };
-    direnv = {
-      enable = true;
-      package = pkgs.direnv;
-      enableBashIntegration = true;
-      enableZshIntegration = true;
-      nix-direnv = {
-        enable = true;
-        package = pkgs.nix-direnv;
-      };
-    };
   };
-
   services = {
     blueman-applet.enable = true;
     # keybase.enable = true;
@@ -216,7 +231,8 @@ in
       startInBackground = false;
     };
     udiskie = {
-      enable = false;
+      enable = true;
+      notify = true;
       settings = {
         program_options = {
           tray = "auto";
diff --git a/hm/modules/hypr.nix b/hm/modules/hypr.nix
index 01f7dbd..bf32aa1 100644
--- a/hm/modules/hypr.nix
+++ b/hm/modules/hypr.nix
@@ -163,45 +163,44 @@
     enable = true;
     layout = [
           {
-              label = "lock - l";
+              label = "lock";
               action = "pkill hyprlock && loginctl lock-session";
-              text = "Lock";
+              text = "Lock - l";
               keybind = "l";
               circular = false;
           }
           {
-              label = "hibernate - h";
+              label = "hibernate";
               action = "systemctl hibernate";
-              text = "Hibernate";
+              text = "Hibernate - h";
               keybind = "h";
               circular = false;
           }
           {
-              label = "logout - e";
+              label = "logout";
               action = "${config.home.homeDirectory}/bin/hypr-logout.sh";
-              # action = "loginctl terminate-session $XDG_SESSION_ID";
-              text = "Logout";
+              text = "Logout - e";
               keybind = "e";
               circular = false;
           }
           {
-            label = "shutdown - s";
+            label = "shutdown";
             action = "systemctl poweroff";
-            text = "Shutdown";
+            text = "Shutdown - s";
             keybind = "s";
             circular = false;
           }
           {
-              label = "suspend - u";
+              label = "suspend";
               action = "systemctl suspend";
-              text = "Suspend";
+              text = "Suspend - u";
               keybind = "u";
               circular = false;
           }
           {
-              label = "reboot - r";
+              label = "reboot";
               action = "systemctl reboot";
-              text = "Reboot";
+              text = "Reboot - r";
               keybind = "r";
               circular = false;
           }
diff --git a/hm/pkgs/default.nix b/hm/pkgs/default.nix
index 5cfee45..0a40f6e 100644
--- a/hm/pkgs/default.nix
+++ b/hm/pkgs/default.nix
@@ -7,6 +7,6 @@ builtins.concatLists
   (import ./llm-packages.nix { inherit pkgs; })
   (import ./rescue-packages.nix { inherit pkgs; })
   (import ./theme-packages.nix { inherit pkgs unstablePkgs; })
-  # (import ./utilities.nix { inherit pkgs; })
+  (import ./utilities.nix { inherit pkgs; })
   (import ./wm-packages.nix { inherit pkgs; })
 ]
\ No newline at end of file
diff --git a/hm/pkgs/inet-packages.nix b/hm/pkgs/inet-packages.nix
index d0176a3..2018cd3 100644
--- a/hm/pkgs/inet-packages.nix
+++ b/hm/pkgs/inet-packages.nix
@@ -11,7 +11,7 @@ with pkgs;
   # mopidy mopidy-tidal mopidy-musicbox-webclient gst_all_1.gstreamer gst_all_1.gst-plugins-bad
 
   # Browsers
-  # firefox-devedition
+  firefox-devedition-bin
   w3m
   ungoogled-chromium
   librewolf
diff --git a/hm/pkgs/utilities.nix b/hm/pkgs/utilities.nix
index fedf589..2452ca1 100644
--- a/hm/pkgs/utilities.nix
+++ b/hm/pkgs/utilities.nix
@@ -1,5 +1,6 @@
 { pkgs, ... }:
 with pkgs;
 [
-  bacula
-]
\ No newline at end of file
+  pv
+  sshpass
+]
diff --git a/modules/containers.nix b/modules/containers.nix
index 38ea6b5..ff211b6 100644
--- a/modules/containers.nix
+++ b/modules/containers.nix
@@ -245,37 +245,37 @@
 
       config = { config, pkgs, ... }:
       {
-        networking.firewall = {
-          enable = false;
-          allowedTCPPorts = [ 80 5173 ];
-          allowedTCPPortRanges = [ { from = 8000; to = 9000; } ];
-          # allowedUDPPorts = [ 53 ];
-        };
         networking = {
+          firewall = {
+            enable = false;
+            allowedTCPPorts = [ 22 80 5173 ];
+            allowedTCPPortRanges = [ { from = 8000; to = 9000; } ];
+          };
           enableIPv6 = false;
-          extraHosts = ''
-            140.82.116.6   api.github.com
-          '';
+          nameservers = [ "192.168.12.25" ];
+          # extraHosts = ''
+          #   140.82.116.6   api.github.com
+          # '';
         };
 
-        services.nginx = {
-          enable = false;
-          package = pkgs.nginx;
-          # user = "www-data";
-          virtualHosts.localhost = {
-            root = "/var/www/site";
-            locations."~ \\.php$".extraConfig = ''
-              fastcgi_pass  unix:${config.services.phpfpm.pools.mypool.socket};
-              fastcgi_index index.php;
-            '';
-            locations."/robots.txt" = {
-              extraConfig = ''
-                rewrite ^/(.*)  $1;
-                return 200 "User-agent: *\nDisallow: /";
-              '';
-            };
-          };
-        };
+        # services.nginx = {
+        #   enable = false;
+        #   package = pkgs.nginx;
+        #   # user = "www-data";
+        #   virtualHosts.localhost = {
+        #     root = "/var/www/site";
+        #     locations."~ \\.php$".extraConfig = ''
+        #       fastcgi_pass  unix:${config.services.phpfpm.pools.mypool.socket};
+        #       fastcgi_index index.php;
+        #     '';
+        #     locations."/robots.txt" = {
+        #       extraConfig = ''
+        #         rewrite ^/(.*)  $1;
+        #         return 200 "User-agent: *\nDisallow: /";
+        #       '';
+        #     };
+        #   };
+        # };
 
         services.mysql = {
           enable = true;
@@ -284,19 +284,31 @@
           # configFile = '''';
         };
 
-        services.phpfpm = {
-          phpPackage = pkgs.php83;
-          pools.mypool = {
-            user = "nobody";
-            settings = {
-              "pm" = "dynamic";
-              "listen.owner" = config.services.nginx.user;
-              "pm.max_children" = 5;
-              "pm.start_servers" = 2;
-              "pm.min_spare_servers" = 1;
-              "pm.max_spare_servers" = 3;
-              "pm.max_requests" = 500;
-            };
+        # services.phpfpm = {
+        #   phpPackage = pkgs.php83;
+        #   pools.mypool = {
+        #     user = "nobody";
+        #     settings = {
+        #       "pm" = "dynamic";
+        #       "listen.owner" = config.services.nginx.user;
+        #       "pm.max_children" = 5;
+        #       "pm.start_servers" = 2;
+        #       "pm.min_spare_servers" = 1;
+        #       "pm.max_spare_servers" = 3;
+        #       "pm.max_requests" = 500;
+        #     };
+        #   };
+        # };
+
+        services.openssh = {
+          enable = true;
+          ports = [ 22 ];
+          settings = {
+            PasswordAuthentication = true;
+            AllowUsers = [ "wayne" ]; # Allows all users by default. Can be [ "user1" "user2" ]
+            UseDns = true;
+            X11Forwarding = false;
+            PermitRootLogin = "no"; # "yes", "without-password", "prohibit-password", "forced-commands-only", "no"
           };
         };
 
@@ -318,8 +330,34 @@
           };
         };
 
+        programs.neovim.enable = true;
+        programs.tmux = {
+          enable = true;
+          terminal = "tmux-256color";
+          keyMode = "vi";
+          shortcut = "s";
+          extraConfig = ''
+            set -ag terminal-overrides ",xterm-256color:RGB"
+            set -g mouse on
+            # Resize pane key bindings
+            bind -r < resize-pane -L 5
+            bind -r > resize-pane -R 5
+            bind -r + resize-pane -U 5
+            bind -r - resize-pane -D 5
+
+            set-option -g status-position top
+          '';
+          plugins = [
+            pkgs.tmuxPlugins.weather
+            pkgs.tmuxPlugins.tmux-fzf
+            pkgs.tmuxPlugins.resurrect
+            pkgs.tmuxPlugins.nord
+          ];
+        };
+
         environment.systemPackages = with pkgs; [
-          php83 php83Packages.composer nodejs_22 vim git
+          # php83 php83Packages.composer
+          nodejs_22 vim git
         ];
 
         system.stateVersion = "24.11";
@@ -417,6 +455,10 @@
           hostPath = "/home/wayne/dev/www/whc/2025/budget";
           isReadOnly = false;
         };
+        "/var/www/jose" = {
+          hostPath = "/home/wayne/dev/www/jose";
+          isReadOnly = false;
+        };
       };
       # Testing to see if this even works
       # forwardPorts = [
@@ -550,8 +592,8 @@
           phpOptions = ''
             extension=${pkgs.php83Extensions.xdebug}/lib/php/extensions/xdebug.so
           '';
-          extraConfig = ''
-          '';
+          # extraConfig = ''
+          # '';
         };
 
         _module.args.pkgs-unstable = import inputs.nixpkgs-unstable {
diff --git a/modules/display.nix b/modules/display.nix
index deb5fef..770cb71 100644
--- a/modules/display.nix
+++ b/modules/display.nix
@@ -1,11 +1,11 @@
 { pkgs, config, ... }:
 {
   hardware.graphics = { # Renamed hardware.graphics
-  enable = true; # Renamed
-  package = pkgs.mesa.drivers; # Renamed
-  # 32bit Support
-  enable32Bit = true; # Renamed
-  # package32 = pkgsi686Linux.mesa.drivers;
+    enable = true; # Renamed
+    package = pkgs.mesa.drivers; # Renamed
+    # 32bit Support
+    enable32Bit = true; # Renamed
+    # package32 = pkgsi686Linux.mesa.drivers;
   };
 
   hardware.nvidia = {
diff --git a/modules/security.nix b/modules/security.nix
index 43b31a3..6c4656c 100644
--- a/modules/security.nix
+++ b/modules/security.nix
@@ -1,84 +1,90 @@
 { pkgs, ... }:
 {
-  security.polkit = {
-    enable = true;
-    extraConfig = ''
-      polkit.addRule(function (action, subject) {
-        if (
-          subject.isInGroup("users") &&
-          [
-            "org.freedesktop.login1.reboot",
-            "org.freedesktop.login1.reboot-multiple-sessions",
-            "org.freedesktop.login1.power-off",
-            "org.freedesktop.login1.power-off-multiple-sessions",
-          ].indexOf(action.id) !== -1
-        ) {
-          return polkit.Result.YES;
-        }
-      });
-      polkit.addRule(function(action, subject) {
-        var YES = polkit.Result.YES;
-        var permission = {
-          // required for udisks1:
-          "org.freedesktop.udisks.filesystem-mount": YES,
-          "org.freedesktop.udisks.luks-unlock": YES,
-          "org.freedesktop.udisks.drive-eject": YES,
-          "org.freedesktop.udisks.drive-detach": YES,
-          // required for udisks2:
-          "org.freedesktop.udisks2.filesystem-mount": YES,
-          "org.freedesktop.udisks2.encrypted-unlock": YES,
-          "org.freedesktop.udisks2.eject-media": YES,
-          "org.freedesktop.udisks2.power-off-drive": YES,
-          // required for udisks2 if using udiskie from another seat (e.g. systemd):
-          "org.freedesktop.udisks2.filesystem-mount-other-seat": YES,
-          "org.freedesktop.udisks2.filesystem-unmount-others": YES,
-          "org.freedesktop.udisks2.encrypted-unlock-other-seat": YES,
-          "org.freedesktop.udisks2.encrypted-unlock-system": YES,
-          "org.freedesktop.udisks2.eject-media-other-seat": YES,
-          "org.freedesktop.udisks2.power-off-drive-other-seat": YES
-        };
-        if (subject.isInGroup("storage")) {
-          return permission[action.id];
-        };
-      });
-      polkit.addRule(function(action, subject) {
-        if (action.id == "org.kde.kio.admin.commands" && subject.isInGroup("wheel")) {
-          return polkit.Result.YES; // No password prompt!
-        }
-      });
-    '';
-  };
-
-  security.pam.services = {
-   login.enableKwallet = true;
-   login.kwallet = {
-     enable = true;
-     package = pkgs.kdePackages.kwallet-pam;
-     # package = pkgs.plasma5Packages.kwallet-pam; # Comment for plasma6
-   };
-  sddm = {
-    enableKwallet = true;
-    text = ''
-      # Enable pam_kwallet5 for sddm
-      auth      optional      pam_kwallet5.so
-      session   optional      pam_kwallet5.so auto_start
-    '';
+  security = {
+    polkit = {
+      enable = true;
+      debug = true;
+      extraConfig = ''
+        polkit.addRule(function (action, subject) {
+          if (
+            subject.isInGroup("users") &&
+            [
+              "org.freedesktop.login1.reboot",
+              "org.freedesktop.login1.reboot-multiple-sessions",
+              "org.freedesktop.login1.power-off",
+              "org.freedesktop.login1.power-off-multiple-sessions",
+            ].indexOf(action.id) !== -1
+          ) {
+            return polkit.Result.YES;
+          }
+        });
+        polkit.addRule(function(action, subject) {
+          var YES = polkit.Result.YES;
+          var permission = {
+            // required for udisks1:
+            "org.freedesktop.udisks.filesystem-mount": YES,
+            "org.freedesktop.udisks.luks-unlock": YES,
+            "org.freedesktop.udisks.drive-eject": YES,
+            "org.freedesktop.udisks.drive-detach": YES,
+            // required for udisks2:
+            "org.freedesktop.udisks2.filesystem-mount": YES,
+            "org.freedesktop.udisks2.encrypted-unlock": YES,
+            "org.freedesktop.udisks2.eject-media": YES,
+            "org.freedesktop.udisks2.power-off-drive": YES,
+            // required for udisks2 if using udiskie from another seat (e.g. systemd):
+            "org.freedesktop.udisks2.filesystem-mount-other-seat": YES,
+            "org.freedesktop.udisks2.filesystem-unmount-others": YES,
+            "org.freedesktop.udisks2.encrypted-unlock-other-seat": YES,
+            "org.freedesktop.udisks2.encrypted-unlock-system": YES,
+            "org.freedesktop.udisks2.eject-media-other-seat": YES,
+            "org.freedesktop.udisks2.power-off-drive-other-seat": YES
+          };
+          if (subject.isInGroup("storage")) {
+            return permission[action.id];
+          };
+        });
+        polkit.addRule(function(action, subject) {
+          if (action.id == "org.kde.kio.admin.commands" && subject.isInGroup("wheel")) {
+            return polkit.Result.YES; // No password prompt!
+          }
+        });
+        polkit.addRule(function(action, subject) {
+          // Make sure to set { security.polkit.debug = true; } in configuration.nix
+          polkit.log("user " +  subject.user + " is attempting action " + action.id + " from PID " + subject.pid);
+        });
+      '';
+      adminIdentities = [
+        "unix-group:wheel"
+      ];
     };
-  };
-  security.pam.services.wayne.kwallet.enable = true;
-  security.pam.services.wayne.kwallet.package = pkgs.kdePackages.kwallet-pam;
-  # security.pam.services.wayne.kwallet.package = pkgs.plasma5Packages.kwallet-pam;
 
-  security.rtkit.enable = true;
+    pam.services = {
+       login.kwallet = {
+         enable = true;
+         package = pkgs.kdePackages.kwallet-pam;
+       };
+      sddm = {
+        enableKwallet = true;
+        text = ''
+          # Enable pam_kwallet5 for sddm
+          auth      optional      pam_kwallet5.so
+          session   optional      pam_kwallet5.so auto_start
+        '';
+        };
+      };
 
-  security.sudo = {
-    enable = true;
-    extraConfig = ''
-      %wheel ALL=(ALL) NOPASSWD: ${pkgs.input-remapper}/bin/input-remapper-service
-      %wheel ALL=(ALL) NOPASSWD: ${pkgs.input-remapper}/bin/input-remapper-control
-      %wheel ALL=(ALL) NOPASSWD: /run/wrappers/bin/systemctl restart display-manager
-      %wheel ALL=(ALL) NOPASSWD: /home/wayne/.nix-profile/bin/journalctl -f
-    '';
+    rtkit.enable = true;
+
+    sudo = {
+      enable = true;
+      extraConfig = ''
+        %wheel ALL=(ALL) NOPASSWD: ${pkgs.input-remapper}/bin/input-remapper-service
+        %wheel ALL=(ALL) NOPASSWD: ${pkgs.input-remapper}/bin/input-remapper-control
+        %wheel ALL=(ALL) NOPASSWD: ${pkgs.input-remapper}/bin/input-remapper-gtk
+        %wheel ALL=(ALL) NOPASSWD: /run/wrappers/bin/systemctl restart display-manager
+        %wheel ALL=(ALL) NOPASSWD: /home/wayne/.nix-profile/bin/journalctl -f
+      '';
+    };
   };
 
   age = {
@@ -96,6 +102,13 @@
       pia = {
         file = ../secrets/pia.age;
       };
+      ff-sync = {
+        file = ../secrets/ff-sync.age;
+      };
     };
   };
 }
+
+  # security.pam.services.wayne.kwallet.enable = true;
+  # security.pam.services.wayne.kwallet.package = pkgs.kdePackages.kwallet-pam;
+  # security.pam.services.wayne.kwallet.package = pkgs.plasma5Packages.kwallet-pam;
\ No newline at end of file
diff --git a/modules/users.nix b/modules/users.nix
index e50c4ea..1394e6d 100644
--- a/modules/users.nix
+++ b/modules/users.nix
@@ -29,6 +29,7 @@
           "audio"
           "pipewire"
           "media"
+          "vboxusers"
         ];
         openssh.authorizedKeys.keys = [ "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC6q75AXShVjzWerPFU3l0YXIKViNF4eyragL+RQIRwdNDMzW34GgxFhtyLsmRB2jsykf4gx76zSyJoii0K4uhS761w5oDXz9mqHnBzJutDTyltLyfgLsFKY8V2sOtSracWx4y6QrtmBR3MKV642kg5mufVEmCMVxBU9oVpIOJUSW+XNYBawaSkl5SjDMLI7flj0v9Hb2dZqjQigDM1+UHxTzcMoZnR6hAzoZpPNMGyZXtrgEotNywNdpxCqRV9T/KvKHSIVYc1umiDXc+5fyRo9v7CzvYY3MvRKWbgUfTVQRmR2kqwsGAsV5W+PEUb8qqmTjPVXusKxKlpGgYfsvIZOv0LgOvQMemga8psFIS9F4YQ1xuc092bKi1LteC+ZlabMLRMux6Wbhjci+v9wnLfTyBQc6wr3wDFJoELKyaLnI7Cn0KHn0k+BiYCCr2TjKrz9XztL490vprYtvVumNFZ3rxewGnZSn/2czaFfQgnXt+9xQGWWo9TJGE5jq1jThnsIbH3G9JIQ1WhomMq6DDRhGMX2ZKDk/M9/fk0XN5nvQyJNmbiHy5srW/VANx8R9jjsVkvX29MXeK6dzpR4ImywXggpYMeHB6moy06cJUu5WoR8aLS/rO5LbZM2j14AMQ2ZK79QfUJSdloA1/HHlmnx7YtzZOebD6BCR7xgasGRw== wayne"];
         packages = with pkgs; [
diff --git a/pkgs/app/servers/ff-sync.nix b/pkgs/app/servers/ff-sync.nix
new file mode 100644
index 0000000..b451aa4
--- /dev/null
+++ b/pkgs/app/servers/ff-sync.nix
@@ -0,0 +1,25 @@
+{ pkgs, config, ...}:
+
+{
+  services.mysql.package = pkgs.mariadb;
+
+  services.firefox-syncserver = {
+    enable = true;
+    package = pkgs.syncstorage-rs;
+    logLevel = "debug"; # error
+    # secrets must be a path to an env file containing SYNC_MASTER_SECRET=<value>;
+    # builtins.toFile with the literal expression text would bake the string, not
+    secrets = config.age.secrets.ff-sync.path; # the decrypted secret, into the store
+    singleNode = {
+      enable = true;
+      hostname = "localhost";
+      url = "http://localhost:5000";
+    };
+    # database = {
+    #   createLocally = true;
+    #   host = "localhost";
+    #   user = "firefox-syncserver";
+    #   name = "firefox_syncserver";
+    # };
+  };
+}
\ No newline at end of file
diff --git a/pkgs/app/terminal/ghostty.nix b/pkgs/app/terminal/ghostty.nix
index 7c3ff28..ebe6880 100644
--- a/pkgs/app/terminal/ghostty.nix
+++ b/pkgs/app/terminal/ghostty.nix
@@ -12,6 +12,7 @@
       background-opacity = 0.65;
       background-blur = 5;
       font-size = 15;
+      app-notifications = "no-clipboard-copy";
     };
     installBatSyntax = true;
     installVimSyntax = true;
diff --git a/pkgs/virtualisation/vbox.nix b/pkgs/virtualisation/vbox.nix
index 1ce42e7..5370372 100644
--- a/pkgs/virtualisation/vbox.nix
+++ b/pkgs/virtualisation/vbox.nix
@@ -2,17 +2,17 @@
 {
   virtualisation.virtualbox = {
     host = {
-      enable = false;
+      enable = true;
       package = pkgs.virtualbox;
       enableExtensionPack = true;
       enableWebService = true;
       addNetworkInterface = true;
       enableHardening = true;
-      headless = false;
+      headless = true;
       enableKvm = false;
     };
     guest = {
-      enable = false;
+      enable = true;
       seamless = true;
       dragAndDrop = true;
       clipboard = true;
diff --git a/pkgs/wm/hyprland/hyprland.nix b/pkgs/wm/hyprland/hyprland.nix
index 23d50a7..5b0b7f9 100644
--- a/pkgs/wm/hyprland/hyprland.nix
+++ b/pkgs/wm/hyprland/hyprland.nix
@@ -6,6 +6,7 @@
     # package = hyprland.packages.stdenv.hostPlatform.system}.hyprland;
 #    xwayland.enable = true;
     systemd.enable = true;
+
     systemd.variables = ["--all"];
 
     # device {
@@ -13,6 +14,7 @@
     # };
     settings = {
       "$mod" = "SUPER";
+
       env = [
         "XDG_SESSION_TYPE,wayland"
         "XDG_SESSION_DESKTOP,Hyprland"
@@ -22,12 +24,13 @@
         "MOZ_ENABLE_WAYLAND,1"
         # "ANKI_WAYLAND,1"
 
-        # "QT_AUTO_SCREEN_SCALE_FACTOR,1"
-        # "QT_WAYLAND_DISABLE_WINDOWDECORATION,1"
-        # "QT_QPA_PLATFORM=wayland,xcb" # removed ,xcb CGPT
-        # "QT_QPA_PLATFORMTHEME,qt6ct"
+        "QT_AUTO_SCREEN_SCALE_FACTOR,1"
+        "QT_WAYLAND_DISABLE_WINDOWDECORATION,1"
+        "QT_QPA_PLATFORM,wayland,xcb" # Hyprland env syntax is NAME,value — '=' would be ignored
+        "QT_QPA_PLATFORMTHEME,qt6ct"
+        "QT_QPA_PLATFORMTHEME,qt5ct"
 
-        # "GDK_SCALE,1"
+        "GDK_SCALE,1"
         "GDK_BACKEND,wayland,x11,*"
 
         # "GTK_THEME,FlatColor:dark"
@@ -35,7 +38,8 @@
 
         # "DISABLE_QT5_COMPAT,0"
         # "NIXOS_OZONE_WL,1"
-        # "ELECTRON_OZONE_PLATFORM_HINT,auto"
+        "OZONE_PLATFORM,wayland"
+        "ELECTRON_OZONE_PLATFORM_HINT,wayland"
         # "__GL_GSYNC_ALLOWED,0"
         # "__GL_VRR_ALLOWED,0"
         "__GLX_VENDOR_LIBRARY_NAME,nvidia"
@@ -49,7 +53,7 @@
         # "APPIMAGELAUNCHER_DISABLE,1"
         # "OZONE_PLATFORM,wayland"
         # "SDL_VIDEODRIVER,wayland"
-        # "CLUTTER_BACKEND,wayland"
+        "CLUTTER_BACKEND,wayland"
         "GBM_BACKEND,nvidia-drm"
         "LIBVA_DRIVER_NAME,nvidia"
         # "AQ_DRM_DEVICES,/dev/dri/card2/" # CHANGEME: Related to the GPU
@@ -64,10 +68,10 @@
 
         "HYPRLAND_TRACE=1"
 
-        # "QT_STYLE_OVERRIDE=kvantum"
+        "QT_STYLE_OVERRIDE,kvantum-dark"
         # "QT_QPA_PLATFORMTHEME = qt6ct" # qt5ct no display
-
       ];
+
       input = {
         kb_layout = "us";
         # kb_variant =;
@@ -85,19 +89,27 @@
 
         sensitivity = 0;
       };
+
       debug = { disable_logs = false; };
-      xwayland = { force_zero_scaling = true; };
+
+      xwayland = {
+        enabled = true;
+        force_zero_scaling = true;
+      };
+
       bindm = [
         "$mod, mouse:272, movewindow"
-        "$mod, Control_L, movewindow"
+        # "$mod, Control_L, movewindow"
         "$mod, mouse:273, resizewindow"
-        "$mod, ALT_L, resizewindow"
+        # "$mod, ALT_L, resizewindow"
+      ];
+
+      bind = [
         "$mod, mouse_down, workspace, e+1" # Scroll workspaces
         "$mod, mouse_up, workspace, e-1" # Scroll workspaces
-        ",mouse:275,exec,wl-copy $(wl-paste -p)" # copy selected text
-        ",mouse:276,exec,wtype -M ctrl -M shift v -m ctrl -m shift" # paste by Ctrl+Shift+
-      ];
-      bind = [
+        "$mod, mouse:275, exec, wl-copy $(wl-paste -p)" # copy selected text
+        "$mod, mouse:276, exec, wtype -M ctrl -M shift v -m ctrl -m shift" # paste by Ctrl+Shift+
+
         "ALTSHIFT, H, movewindow, l"
         "ALTSHIFT, L, movewindow, r"
         "ALTSHIFT, K, movewindow, u"
@@ -111,22 +123,24 @@
         "$mod, Q, killactive"
         "ALT, T, togglefloating"
         "ALT, F, fullscreen"
+        "$mod, Prior, exec, pkill -f kitty --class=kitty-logs || kitty --class='kitty-logs' journalctl -f"
         # Minimize window
         "ALT, M, movetoworkspace, special:magic"
         "ALT, M, togglespecialworkspace, magic"
 
-        "Control_R+Shift_R, B, exec, ~/bin/restore_minimized.sh Keybase"
-        "Control_R+Shift_R, X, exec, ~/bin/restore_minimized.sh org.kde.kwalletmanager5"
-        "Control_R+Shift_R, N, exec, ~/bin/restore_minimized.sh com.nextcloud.desktopclient.nextcloud"
-        "Control_R+Shift_R, Z, exec, ~/bin/restore_minimized.sh zen-alpha"
-        "Control_R+Shift_R, S, exec, ~/bin/restore_minimized.sh sublime_text"
-        # "$mod, R, exec, ~/bin/restore_minimized.sh org.kde.dolphin"
+        "Control_R+Shift_R, B, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh Keybase"
+        "Control_R+Shift_R, X, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh org.kde.kwalletmanager5"
+        "Control_R+Shift_R, N, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh com.nextcloud.desktopclient.nextcloud"
+        "Control_R+Shift_R, Z, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh zen-alpha"
+        "Control_R+Shift_R, S, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh sublime_text"
+        # "$mod, R, exec, ${config.home.homeDirectory}/bin/restore_minimized.sh org.kde.dolphin"
 
 
         "$mod, F, exec, firefox"
         "$mod, Z, exec, zen"
         "$mod, k, exec, kitty"
         "$mod, G, exec, ghostty"
+        "$mod, Home, exec, firefox-developer-edition --new-tab http://webserver/jose/ --devtools"
         # "$mod+SHIFT_R, k, exec, pypr toggle term"
         "$mod+Shift_R, K, exec, konsole"
         "$mod, S, exec, subl"
@@ -182,29 +196,20 @@
         "ALTSHIFT, code:17, movetoworkspace, 8"
         "ALTSHIFT, code:18, movetoworkspace, 9"
         "ALTSHIFT, code:19, movetoworkspace, 10"
-
       ];
-        # ++ (
-        #   builtins.concatLists (builtins.genList (i:
-        #       let ws =i +1;
-        #       in [
-        #         "ALT code:1${toString i}, workspace, ${toString ws}"
-        #         "ALTSHIFT, code:1$i{toString i}, movetoworkspace, ${toString ws}"
-        #       ]
-        #     )
-        #   9)
-        # );
 
       cursor = {
         # no_hardware_cursors = true;
         default_monitor = "DP-5";
       };
+
       dwindle = {
           # See https://wiki.hyprland.org/Configuring/Dwindle-Layout/ for more
           pseudotile = true; # master switch for pseudotiling. Enabling is bound to mainMod + P in the keybinds section below
           preserve_split = true; # you probably want this
           special_scale_factor = 0.60; # Fix for floaters
       };
+
       monitor = [
         "DP-3, 1920x1080, 0x616, 1"
         "DP-5, 3840x2160, 1920x0, 1"
@@ -244,7 +249,7 @@
         # "${config.home.homeDirectory}/.config/hypr/scripts/start-in-tray.sh &"
         "${config.home.homeDirectory}/.config/hypr/scripts/start-keybase-gui.sh &"
         "input-remapper-control --command autoload &"
-        "/nix/store/yvnjx0qis4qdajc52f4xswhz4g69lcbn-polkit-kde-agent-1-6.2.5/libexec/polkit-kde-authentication-agent-1"
+        # "/nix/store/yvnjx0qis4qdajc52f4xswhz4g69lcbn-polkit-kde-agent-1-6.2.5/libexec/polkit-kde-authentication-agent-1"
 
         "[workspace 4 silent] remmina --enable-fullscreen -c /home/wayne/.local/share/remmina/group_rdp_xeon_xeon-local.remmina"
         # rdp://wayne@xeon.local"
@@ -271,21 +276,23 @@
         animation = [
           "windows,1,4,default,slide"
           "border,1,5,default"
-          "fadein,1,5,default"
-          "workspaces,1,3,default,vertslide"
+          "fadeIn,1,5,default"
+          "workspaces,1,3,default,slidevert"
         ];
       };
 
       decoration = {
         rounding = 8;
-        drop_shadow = true;
-        shadow_range = 60;
-        col.shadow = "0x66000000";
         blur = {
           enabled = true;
           size = 5;
           passes = 1;
         };
+        shadow = {
+          enabled = true;
+          range = 60;
+          color = "0x66000000";
+        };
       };
 
       layerrule = [
@@ -296,7 +303,6 @@
         "blur, waybar"
       ];
 
-
       windowrule = [
         # "opacity 0.9 override 0.5 override 0.8 override, ^(.*)$" # set opacity to 1.0 active, 0.5 inactive and 0.8 fullscreen for kitty
         # "opacity 1 override 0.9 override 0.9 override, ^(zen-alpha)$" # set opacity to 1.0 active, 0.5 inactive and 0.8 fullscreen for kitty
@@ -364,13 +370,32 @@
       ];
 
       input.numlock_by_default = true;
+
       debug.suppress_errors = true;
+
+      # plugin = {
+      #   hyprbars = {
+      #     hyprbars-button = [
+      #       "rgb(ff4040), 10, 󰖭, hyprctl dispatch killactive"
+      #       "rgb(eeee11), 10, , hyprctl dispatch fullscreen 1"
+      #       "rgb(00ff00), 10, 󰍈, hyprctl dispatch togglefloating"
+      #     ];
+      #     bar_height = 20;
+      #     bar_blur = 0.5;
+      #     bar_button_padding = 10;
+      #     icon_on_hover = true;
+      #   };
+      #   hyprwinwrap = {
+      #     class = "kitty-logs";
+      #     # || class = "ff-dev";
+      #   };
+      # };
     };
 
-
     # extraConfig = {};
-    plugins = [
-      pkgs.hyprlandPlugins.hyprbars
-    ];
+    # plugins = [
+    #   pkgs.hyprlandPlugins.hyprbars
+    #   pkgs.hyprlandPlugins.hyprwinwrap
+    # ];
   };
 }
diff --git a/secrets/ff-sync.age b/secrets/ff-sync.age
new file mode 100644
index 0000000..0423076
--- /dev/null
+++ b/secrets/ff-sync.age
@@ -0,0 +1,17 @@
+age-encryption.org/v1
+-> ssh-rsa 28puQg
+lVwHDCRofemRbFtwuqso096ONdhbSBWO4NcQCwuUQZBa9WsW2WzcWlEIN7q2nRxG
+dGGcDvV1bXRz5JAdkUmTQ1T2ane5vd+d5cglOREftVA8Xxw+9lfC2T7fSoAWxNwh
+iCQ8+JKiDM8crFf2dW0dxTBYiLqINtHx4rgyfBDlug1K5XlHgUNCaV2Z9zfhyHrO
+BAlFOUWNZiQspYM8t42sIClm5dEyu0fqXNOE/MFvY3F3tMCU1EI3j80Pbji7T0JT
+Mfd1eR33Ee/xK9Nivp3GYoKEcpXNzesOdzb87OvEXaZNGq7OQ4Bjc4xcLj7wjgLn
+V9N6pfYeP4k6jccVor9yHLhZDzEKXECN4VGi53xliDqRUn0/oegvAgggp4qGCAl1
+w1EAbV2XgrlX9TlcQhH9lGXgXmk19lhs40IGg//utB9rI16+2sJtaECx7nfwAP70
+kCTcyc3GnOqlccrHNGDpYX4lqnUvPcyn73xWBkU7a6mgqTaG05lQMm03A9x9jfQR
+39AD42KKPpNoGZj+bVJeNga3X304hNt+UMZR0pdLd48fUkFBNnCI1LK8bJbZltsN
+r8LkF5akSihU/v80tnYpIU6MHEl48CSW3vZk+D/swKdDVVvWO/bGKl/YgIGGzUq2
+4VoAfvxfwClqiUxWs202SBit0On+C2oV+E/hp5jF1wM
+-> ssh-ed25519 rxYdLA YgKhuiaoQLtRqOQaTXuoq+8WPlppAozXd1WHgDihOiI
+W8Uw86kUq93y/6Ogs5Z+4VZvLOK2ujL7spnLqRrJOuQ
+--- kXUATZBxfjNOwb4/6TW9wWrfUjl8AQhUYIDh8uoOmpQ
+DkgbY:WkD]&]KYeFu
\ No newline at end of file
diff --git a/system.sublime-workspace b/system.sublime-workspace
index ffa08dd..b564d67 100644
--- a/system.sublime-workspace
+++ b/system.sublime-workspace
@@ -3,6 +3,14 @@
 	{
 		"selected_items":
 		[
+			[
+				"li",
+				"libsForQt5"
+			],
+			[
+				"Us",
+				"USER"
+			],
 			[
 				"p",
 				"pkgs"
@@ -336,34 +344,25 @@
 			"file": "configuration.nix",
 			"settings":
 			{
-				"buffer_size": 9213,
+				"buffer_size": 9129,
 				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
 		},
 		{
-			"file": "environment.nix",
+			"file": "pkgs/virtualisation/vbox.nix",
 			"settings":
 			{
-				"buffer_size": 2466,
+				"buffer_size": 463,
 				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
 		},
 		{
-			"file": "hm/modules/theme.nix",
+			"file": "modules/users.nix",
 			"settings":
 			{
-				"buffer_size": 1585,
-				"encoding": "UTF-8",
-				"line_ending": "Unix"
-			}
-		},
-		{
-			"file": "modules/security.nix",
-			"settings":
-			{
-				"buffer_size": 3147,
+				"buffer_size": 1533,
 				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
@@ -372,7 +371,7 @@
 			"file": "hm/home.nix",
 			"settings":
 			{
-				"buffer_size": 7145,
+				"buffer_size": 7656,
 				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
@@ -404,37 +403,21 @@
 				"line_ending": "Unix"
 			}
 		},
-		{
-			"file": "modules/containers.nix",
-			"settings":
-			{
-				"buffer_size": 38088,
-				"encoding": "UTF-8",
-				"line_ending": "Unix"
-			}
-		},
 		{
 			"file": "pkgs/wm/hyprland/hyprland.nix",
 			"settings":
 			{
-				"buffer_size": 14249,
+				"buffer_size": 15139,
 				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
 		},
 		{
-			"file": "/home/wayne/.local/share/remmina/group_rdp_xeon_xeon-local.remmina",
+			"file": "/home/wayne/.config/hypr/hyprland.conf",
 			"settings":
 			{
-				"buffer_size": 1583,
-				"line_ending": "Unix"
-			}
-		},
-		{
-			"file": "/home/wayne/.config/systemd/user/blueman-applet.service",
-			"settings":
-			{
-				"buffer_size": 283,
+				"buffer_size": 8038,
+				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
 		},
@@ -448,12 +431,12 @@
 			}
 		},
 		{
-			"contents": "bqawethttps://www.slowcookerclub.com/slow-cooker-roast-beef/\nhttps://www.delish.com/cooking/recipe-ideas/a44690466/shredded-beef-recipe/\nhttps://www.slowcookerclub.com/slow-cooker-steak-pie/\nbb00\n3,249,164,898,304 2.95TB\n\nhttps://www.alphavantage.co/query?function=HISTORICAL_OPTIONS&symbol=SILVER&date=2017-11-15&apikey=4JLXS4COZJMQR98O\n\nhttps://finance.yahoo.com/quote/%5EGSPC/options/\n\nhttps://www.lacare.org/sites/default/files/la0262_mcla_eoc_en_202501.pdf\n\nhttps://www.healthcareoptions.dhcs.ca.gov/en/health-plan-materials?counties=Los%20Angeles\n\nhttps://providers.lacare.org/v3app/a/?6713520D04184E241C0D022C0D340C1C130C11166E221738130C0A0C1737483A0002181B1A18225A403E370111000A0017361A331706230A171747040051305A030F4913504C5F0E5D235D01440F5D06405E75415663485C5B08015E122E575255120E50400B585A3848140C1C130C11162816582C06570F0B0447182948075E16504C4058565F685751080A4857114B7744016303000D5F550E07201D1205135557405C565E755C5E5A5B57273632724B516626235D282747190A262429355C5144512A57702C522B/#comparisonPage\n\nhttps://download.lineage.microg.org/river/\n\n",
+			"contents": "bqawet\nhttps://www.slowcookerclub.com/slow-cooker-roast-beef/\nhttps://www.delish.com/cooking/recipe-ideas/a44690466/shredded-beef-recipe/\nhttps://www.slowcookerclub.com/slow-cooker-steak-pie/\nbb00\n3,249,164,898,304 2.95TB\n\nhttps://www.alphavantage.co/query?function=HISTORICAL_OPTIONS&symbol=SILVER&date=2017-11-15&apikey=4JLXS4COZJMQR98O\n\nhttps://finance.yahoo.com/quote/%5EGSPC/options/\n\nhttps://www.lacare.org/sites/default/files/la0262_mcla_eoc_en_202501.pdf\n\nhttps://www.healthcareoptions.dhcs.ca.gov/en/health-plan-materials?counties=Los%20Angeles\n\nhttps://providers.lacare.org/v3app/a/?6713520D04184E241C0D022C0D340C1C130C11166E221738130C0A0C1737483A0002181B1A18225A403E370111000A0017361A331706230A171747040051305A030F4913504C5F0E5D235D01440F5D06405E75415663485C5B08015E122E575255120E50400B585A3848140C1C130C11162816582C06570F0B0447182948075E16504C4058565F685751080A4857114B7744016303000D5F550E07201D1205135557405C565E755C5E5A5B57273632724B516626235D282747190A262429355C5144512A57702C522B/#comparisonPage\n\nhttps://download.lineage.microg.org/river/\n\n",
 			"settings":
 			{
-				"buffer_size": 1056,
+				"buffer_size": 1057,
 				"line_ending": "Unix",
-				"name": "bqawethttps://www.slowcookerclub.com/slow-cooker-r"
+				"name": "bqawet"
 			}
 		},
 		{
@@ -466,13 +449,11 @@
 			}
 		},
 		{
-			"contents": "#!/usr/bin/env bash\n\nMAX_RETRIES=10\n\ncheck_wallet() {\n  retry_count=0\n  while ! dbus-send --session --dest=org.kde.kwalletd5 --type=method_call --print-reply /modules/kwalletd5 org.kde.KWallet.wallets | grep -q \"kdewallet\"; do\n    echo \"Waiting for KWallet to unlock...\"\n    sleep 5\n    retry_count=$((retry_count+1))\n    if [ \"$retry_count\" -ge \"$MAX_RETRIES\" ]; then\n      echo \"Failed to detect unlocked KWallet after $MAX_RETRIES attempts.\"\n      exit 1\n    fi\n  done\n  echo \"KWallet is unlocked\"\n  is_loggedin\n}\n\nis_loggedin() {\n  retry_count=0\n  while ! keybase login; do\n    echo \"Keybase login failed, retrying...\"\n    /home/wayne/.nix-profile/bin/keybase login\n    sleep 5\n    retry_count=$((retry_count+1))\n    if [ \"$retry_count\" -ge \"$MAX_RETRIES\" ]; then\n      echo \"Failed to log in to Keybase after $MAX_RETRIES attempts.\"\n      exit 1\n    fi\n  done\n  echo \"Keybase is logged in\"\n  is_mounted\n}\n\nis_mounted() {\n  retry_count=0\n  while ! mount | grep kbfs > /dev/null; do\n    echo \"Mounting kbfs...\"\n    /home/wayne/.nix-profile/bin/kbfsfuse &\n    sleep 2\n    retry_count=$((retry_count+1))\n    if [ \"$retry_count\" -ge \"$MAX_RETRIES\" ]; then\n      echo \"Failed to mount kbfs after $MAX_RETRIES attempts.\"\n      exit 1\n    fi\n  done\n  echo \"kbfs is mounted\"\n  launch_gui\n}\n\nlaunch_gui() {\n  # echo \"Launching Keybase GUI...\"\n  # /home/wayne/.nix-profile/bin/keybase-gui %u &\n  # sleep 10\n  # /run/current-system/sw/bin/input-remapper-control --config-dir /home/wayne/.config/input-remapper-2/ --device 'Hanvon Ugee Shortcut Remote' --preset 'new preset 10' &\n}\n\ncheck_wallet\n\n\nif (action.id == \"org.freedesktop.policykit.exec\" && subject.isInGroup(\"wheel\")) {\n          return polkit.Result.YES;\n        }",
 			"file": "/home/wayne/.config/hypr/scripts/start-keybase-gui.sh",
-			"file_size": 1588,
-			"file_write_time": 133880983808839978,
 			"settings":
 			{
-				"buffer_size": 1718,
+				"buffer_size": 1579,
+				"encoding": "UTF-8",
 				"line_ending": "Unix"
 			}
 		},
@@ -561,29 +542,59 @@
 		"/home/wayne/system",
 		"/home/wayne/system/hm",
 		"/home/wayne/system/hm/modules",
+		"/home/wayne/system/hm/pkgs",
 		"/home/wayne/system/modules",
 		"/home/wayne/system/nix",
 		"/home/wayne/system/pkgs",
 		"/home/wayne/system/pkgs/app",
 		"/home/wayne/system/pkgs/app/terminal",
-		"/home/wayne/system/pkgs/wm"
+		"/home/wayne/system/pkgs/app/utilities",
+		"/home/wayne/system/pkgs/virtualisation"
 	],
 	"file_history":
 	[
+		"/home/wayne/system/modules/containers.nix",
+		"/home/wayne/bin/rsync.sh",
+		"/home/wayne/bin/nextcloud_backup.sh",
+		"/home/wayne/system/hm/pkgs/inet-packages.nix",
+		"/home/wayne/system/hm/pkgs/default.nix",
+		"/home/wayne/system/modules/display.nix",
+		"/home/wayne/system/hardware-configuration.nix",
+		"/home/wayne/.ssh/config",
+		"/home/wayne/system/hm/pkgs/utilities.nix",
+		"/home/wayne/.dotfiles/home/bin/bin/rsync.sh",
+		"/home/wayne/backups/mail/2025/04/soldmyemail.info-full_20250424-000.snar",
+		"/home/wayne/.ssh/id_rsa",
+		"/home/wayne/.ssh/authorized_keys",
+		"/home/wayne/test/backup.sh",
+		"/home/wayne/test/backup.sh.bak",
+		"/home/wayne/sshfs/bin/backup.sh",
+		"/home/wayne/test/test/backup/2025/04/logs/img2img-images-incremental_20250426-002.log",
+		"/home/wayne/test/test/backup/2025/04/logs/extras-images-full_20250426-000.log",
+		"/home/wayne/test/test/backup/2025/04/logs/extras-images-incremental_20250426-001.log",
+		"/home/wayne/.dotfiles/home/config/.config/mpv/config.inc",
+		"/home/wayne/.dotfiles/home/bin/bin/removeGenerations.sh",
+		"/home/wayne/.local/share/remmina/group_rdp_xeon_xeon-local.remmina",
+		"/home/wayne/system/hm/pkgs/theme-packages.nix",
+		"/home/wayne/system/environment.nix",
+		"/home/wayne/system/modules/security.nix",
+		"/home/wayne/system/secrets/secrets.nix",
+		"/home/wayne/system/pkgs/app/servers/ff-sync.nix",
 		"/home/wayne/system/hm/modules/hypr.nix",
+		"/home/wayne/.config/wlogout/layout",
+		"/home/wayne/system/pkgs/app/terminal/ghostty.nix",
+		"/home/wayne/.config/systemd/user/blueman-applet.service",
+		"/home/wayne/system/pkgs/virtualisation/vbox.nix",
+		"/etc/systemd/system/polkit.service",
+		"/etc/systemd/system/polkit.service.d/overrides.conf",
+		"/home/wayne/system/pkgs/virtualisation/docker.nix",
+		"/home/wayne/system/hm/modules/theme.nix",
 		"/home/wayne/system/hm/pkgs/wm-packages.nix",
 		"/home/wayne/system/modules/users.nix",
-		"/home/wayne/system/hardware-configuration.nix",
 		"/home/wayne/bin/hypr-logout.sh",
 		"/home/wayne/tmp/hypr/hyprexitwithgrace.log",
 		"/home/wayne/system/pkgs/app/utilities/bacula.nix",
-		"/home/wayne/system/hm/pkgs/utilities.nix",
-		"/home/wayne/system/hm/pkgs/default.nix",
 		"/home/wayne/system/modules/nix.nix",
-		"/home/wayne/system/pkgs/virtualisation/docker.nix",
-		"/home/wayne/system/pkgs/app/terminal/ghostty.nix",
-		"/home/wayne/system/hm/pkgs/inet-packages.nix",
-		"/home/wayne/system/pkgs/virtualisation/vbox.nix",
 		"/home/wayne/system/modules/files.nix",
 		"/home/wayne/.config/autostart/keybase_autostart.desktop",
 		"/home/wayne/.config/waybar/style.css",
@@ -602,15 +613,11 @@
 		"/home/wayne/invokeai/databases/invokeai.db",
 		"/home/wayne/invokeai/configs/models.yaml",
 		"/home/wayne/invokeai/invokeai.yaml",
-		"/home/wayne/system/modules/security.nix",
-		"/home/wayne/system/hm/pkgs/theme-packages.nix",
 		"/home/wayne/system/modules/wm.nix",
 		"/home/wayne/system/hm/pkgs/llm-packages.nix",
 		"/home/wayne/.config/hypr/scripts/start-in-tray.sh",
 		"/home/wayne/.config/hypr/scripts/start-keybase-gui.sh",
-		"/home/wayne/system/environment.nix",
 		"/home/wayne/system/hm/modules/wayland.nix",
-		"/home/wayne/system/hm/modules/theme.nix",
 		"/home/wayne/.config/ghostty/config",
 		"/home/wayne/system/modules/network.nix",
 		"/home/wayne/.config/kiorc",
@@ -619,7 +626,6 @@
 		"/home/wayne/system/hm/pkgs/creative-packages.nix",
 		"/etc/nixos/smb-secrets",
 		"/home/wayne/.config/hypr/hyprland.conf",
-		"/home/wayne/system/modules/containers.nix",
 		"/home/wayne/system/pkgs/wm/waybar/default.json",
 		"/home/wayne/system/pkgs/wm/waybar/waybar.nix",
 		"/home/wayne/system/hm/home.nix",
@@ -632,8 +638,6 @@
 		"/home/wayne/dev/www/lms-aider/.gitignore",
 		"/home/wayne/dev/www/lms-aider/.envrc",
 		"/home/wayne/system/modules/musnix.nix",
-		"/home/wayne/system/modules/display.nix",
-		"/home/wayne/.ssh/config",
 		"/home/wayne/dev/www/maelstrom/maelstrom/admin/package.json",
 		"/home/wayne/dev/www/maelstrom/maelstrom/admin/README",
 		"/home/wayne/Flakes/flake-aider-chat/README.md",
@@ -669,7 +673,6 @@
 		"/home/wayne/dev/www/whc/2025/dashboard/ticker.html",
 		"/home/wayne/dev/www/whc/2025/dashboard/js/calendar.js",
 		"/home/wayne/system/pkgs/shell/cli-collection.nix",
-		"/home/wayne/.local/share/remmina/group_rdp_xeon_xeon-local.remmina",
 		"/home/wayne/.config/remmina/remmina.pref",
 		"/home/wayne/.config/cava/config",
 		"/home/wayne/system/modules/tailscale.nix",
@@ -677,31 +680,11 @@
 		"/home/wayne/dev/whd/live/public/index.php_",
 		"/home/wayne/dev/whd/live/public/tpl/services.tpl.php",
 		"/home/wayne/dev/whd/live/public/tpl/head.tpl.php",
-		"/home/wayne/dev/whd/live/public/tpl/hero.tpl.php",
-		"/home/wayne/dev/whd/live/public/inc/analytics.inc.php",
-		"/home/wayne/dev/whd/live/public/robots.txt",
-		"/home/wayne/.dotfiles/home/bin/bin/my-tmux.sh",
-		"/home/wayne/dev/whd/live/waynehayesdevelopment.com.conf",
-		"/home/wayne/dev/whd/live/nextcloud.conf",
-		"/home/wayne/dev/whd/live/wiki.waynehayesdevelopment.com.conf",
-		"/home/wayne/dev/whd/live/default",
-		"/home/wayne/.config/wlogout/style.css",
-		"/home/wayne/system/secrets/secrets.nix",
-		"/home/wayne/.viminfo",
-		"/home/wayne/.config/nvim/init.lua",
-		"/home/wayne/.config/nvim/lua/vim-options.lua",
-		"/home/wayne/.config/nvim/lua/plugins.lua",
-		"/home/wayne/.config/nvim/lua/plugins/copilot.lua",
-		"/home/wayne/dev/whd/live/public/humans.txt",
-		"/home/wayne/dev/whd/live/public/assets/img/logo-purple.svg",
-		"/home/wayne/dev/whd/live/office.conf",
-		"/home/wayne/.config/tmux/tmux.conf",
-		"/home/wayne/.tmux.conf",
-		"/home/wayne/dev/crud-bots/fastapi-beanie-jwt/shell.nix"
+		"/home/wayne/dev/whd/live/public/tpl/hero.tpl.php"
 	],
 	"find":
 	{
-		"height": 41.0
+		"height": 35.0
 	},
 	"find_in_files":
 	{
@@ -718,6 +701,17 @@
 		"case_sensitive": false,
 		"find_history":
 		[
+			"extraConfig",
+			"phpfpm",
+			"mpv",
+			"sound",
+			"remote",
+			"input-remapper",
+			"qtwayland",
+			"polkit",
+			"kdePackages.wayland",
+			"wayland",
+			"qt",
 			"polkit",
 			"ksudoku",
 			"kio",
@@ -834,18 +828,7 @@
 			"systemd",
 			"touch",
 			"i2c",
-			"kitty",
-			"zsh",
-			"home",
-			"flake",
-			"wayland",
-			"ssh",
-			"agenix",
-			"TIME",
-			".png",
-			"wallpapers",
-			"hyprpaper",
-			"eth0"
+			"kitty"
 		],
 		"highlight": false,
 		"in_selection": false,
@@ -937,10 +920,10 @@
 							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 4536.0,
+						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 7,
+					"stack_index": 2,
 					"type": "text"
 				},
 				{
@@ -949,15 +932,15 @@
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 9213,
+						"buffer_size": 9129,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								8520,
-								8520
+								2262,
+								2262
 							]
 						],
 						"settings":
@@ -967,27 +950,27 @@
 							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 8313.0,
+						"translation.y": 585.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 4,
+					"stack_index": 7,
 					"type": "text"
 				},
 				{
 					"buffer": 2,
-					"file": "environment.nix",
+					"file": "pkgs/virtualisation/vbox.nix",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 2466,
+						"buffer_size": 463,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								1036,
-								1036
+								265,
+								265
 							]
 						],
 						"settings":
@@ -1000,24 +983,24 @@
 						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 3,
+					"stack_index": 1,
 					"type": "text"
 				},
 				{
 					"buffer": 3,
-					"file": "hm/modules/theme.nix",
+					"file": "modules/users.nix",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 1585,
+						"buffer_size": 1533,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								593,
-								593
+								627,
+								627
 							]
 						],
 						"settings":
@@ -1035,49 +1018,19 @@
 				},
 				{
 					"buffer": 4,
-					"file": "modules/security.nix",
-					"semi_transient": false,
-					"settings":
-					{
-						"buffer_size": 3147,
-						"regions":
-						{
-						},
-						"selection":
-						[
-							[
-								1749,
-								1749
-							]
-						],
-						"settings":
-						{
-							"syntax": "Packages/Nix/nix.tmLanguage",
-							"tab_size": 2,
-							"translate_tabs_to_spaces": true
-						},
-						"translation.x": 0.0,
-						"translation.y": 0.0,
-						"zoom_level": 1.0
-					},
-					"stack_index": 1,
-					"type": "text"
-				},
-				{
-					"buffer": 5,
 					"file": "hm/home.nix",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 7145,
+						"buffer_size": 7656,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								5799,
-								5799
+								821,
+								821
 							]
 						],
 						"settings":
@@ -1086,15 +1039,15 @@
 							"tab_size": 2,
 							"translate_tabs_to_spaces": true
 						},
-						"translation.x": 0.0,
-						"translation.y": 288.0,
+						"translation.x": 118.0,
+						"translation.y": 3627.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 5,
+					"stack_index": 4,
 					"type": "text"
 				},
 				{
-					"buffer": 6,
+					"buffer": 5,
 					"file": "pkgs/wm/waybar/default.json",
 					"semi_transient": false,
 					"settings":
@@ -1120,11 +1073,11 @@
 						"translation.y": 3951.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 8,
+					"stack_index": 6,
 					"type": "text"
 				},
 				{
-					"buffer": 7,
+					"buffer": 6,
 					"file": "pkgs/wm/waybar/waybar.nix",
 					"semi_transient": false,
 					"settings":
@@ -1146,15 +1099,15 @@
 							"tab_size": 2,
 							"translate_tabs_to_spaces": true
 						},
-						"translation.x": 244.0,
-						"translation.y": 1889.0,
+						"translation.x": 693.0,
+						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 9,
+					"stack_index": 8,
 					"type": "text"
 				},
 				{
-					"buffer": 8,
+					"buffer": 7,
 					"file": "/home/wayne/.config/waybar/style.css",
 					"semi_transient": false,
 					"settings":
@@ -2017,46 +1970,46 @@
 						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 15,
+					"stack_index": 13,
 					"type": "text"
 				},
 				{
-					"buffer": 9,
-					"file": "modules/containers.nix",
+					"buffer": 8,
+					"file": "pkgs/wm/hyprland/hyprland.nix",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 38088,
+						"buffer_size": 15139,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								3644,
-								3644
+								14993,
+								14455
 							]
 						],
 						"settings":
 						{
 							"syntax": "Packages/Nix/nix.tmLanguage",
-							"tab_size": 2,
+							"tab_size": 4,
 							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 0.0,
+						"translation.y": 9711.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 12,
+					"stack_index": 5,
 					"type": "text"
 				},
 				{
-					"buffer": 10,
-					"file": "pkgs/wm/hyprland/hyprland.nix",
+					"buffer": 9,
+					"file": "/home/wayne/.config/hypr/hyprland.conf",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 14249,
+						"buffer_size": 8038,
 						"regions":
 						{
 							"col_66000001":
@@ -2065,8 +2018,8 @@
 								"regions":
 								[
 									[
-										10524,
-										10534
+										592,
+										602
 									]
 								],
 								"scope": "col_66000001"
@@ -2078,8 +2031,8 @@
 								"regions":
 								[
 									[
-										10502,
-										10502
+										582,
+										582
 									]
 								],
 								"scope": "col_gutter"
@@ -2088,81 +2041,25 @@
 						"selection":
 						[
 							[
-								8806,
-								8806
+								1533,
+								1533
 							]
 						],
 						"settings":
 						{
-							"syntax": "Packages/Nix/nix.tmLanguage",
+							"syntax": "Packages/Text/Plain text.tmLanguage",
 							"tab_size": 2,
 							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 9261.0,
+						"translation.y": 1584.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 6,
+					"stack_index": 12,
 					"type": "text"
 				},
 				{
-					"buffer": 11,
-					"file": "/home/wayne/.local/share/remmina/group_rdp_xeon_xeon-local.remmina",
-					"semi_transient": false,
-					"settings":
-					{
-						"buffer_size": 1583,
-						"regions":
-						{
-						},
-						"selection":
-						[
-							[
-								0,
-								0
-							]
-						],
-						"settings":
-						{
-							"syntax": "Packages/Text/Plain text.tmLanguage"
-						},
-						"translation.x": 0.0,
-						"translation.y": 0.0,
-						"zoom_level": 1.0
-					},
-					"stack_index": 13,
-					"type": "text"
-				},
-				{
-					"buffer": 12,
-					"file": "/home/wayne/.config/systemd/user/blueman-applet.service",
-					"semi_transient": false,
-					"settings":
-					{
-						"buffer_size": 283,
-						"regions":
-						{
-						},
-						"selection":
-						[
-							[
-								141,
-								141
-							]
-						],
-						"settings":
-						{
-							"syntax": "Packages/Text/Plain text.tmLanguage"
-						},
-						"translation.x": 0.0,
-						"translation.y": 0.0,
-						"zoom_level": 1.0
-					},
-					"stack_index": 14,
-					"type": "text"
-				},
-				{
-					"buffer": 13,
+					"buffer": 10,
 					"semi_transient": false,
 					"settings":
 					{
@@ -2173,8 +2070,8 @@
 						"selection":
 						[
 							[
-								564,
-								564
+								688,
+								688
 							]
 						],
 						"settings":
@@ -2186,45 +2083,45 @@
 						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 16,
+					"stack_index": 11,
 					"type": "text"
 				}
 			]
 		},
 		{
-			"selected": 2,
+			"selected": 0,
 			"sheets":
 			[
 				{
-					"buffer": 14,
+					"buffer": 11,
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 1056,
+						"buffer_size": 1057,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								1056,
-								1056
+								464,
+								464
 							]
 						],
 						"settings":
 						{
-							"auto_name": "bqawethttps://www.slowcookerclub.com/slow-cooker-r",
+							"auto_name": "bqawet",
 							"syntax": "Packages/Text/Plain text.tmLanguage"
 						},
 						"translation.x": 0.0,
-						"translation.y": 0.0,
+						"translation.y": 234.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 11,
+					"stack_index": 3,
 					"type": "text"
 				},
 				{
-					"buffer": 15,
+					"buffer": 12,
 					"file": "/home/wayne/.config/hypr/scripts/start-in-tray.sh",
 					"semi_transient": false,
 					"settings":
@@ -2242,30 +2139,32 @@
 						],
 						"settings":
 						{
-							"syntax": "Packages/ShellScript/Bash.sublime-syntax"
+							"syntax": "Packages/ShellScript/Bash.sublime-syntax",
+							"tab_size": 2,
+							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 0.0,
+						"translation.y": 234.0,
 						"zoom_level": 1.0
 					},
 					"stack_index": 10,
 					"type": "text"
 				},
 				{
-					"buffer": 16,
+					"buffer": 13,
 					"file": "/home/wayne/.config/hypr/scripts/start-keybase-gui.sh",
 					"semi_transient": false,
 					"settings":
 					{
-						"buffer_size": 1718,
+						"buffer_size": 1579,
 						"regions":
 						{
 						},
 						"selection":
 						[
 							[
-								1718,
-								1718
+								0,
+								0
 							]
 						],
 						"settings":
@@ -2275,10 +2174,10 @@
 							"translate_tabs_to_spaces": true
 						},
 						"translation.x": 0.0,
-						"translation.y": 1888.0,
+						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 2,
+					"stack_index": 9,
 					"type": "text"
 				}
 			]
@@ -2288,7 +2187,7 @@
 			"sheets":
 			[
 				{
-					"buffer": 17,
+					"buffer": 14,
 					"semi_transient": false,
 					"settings":
 					{
@@ -2314,7 +2213,7 @@
 						"translation.y": 0.0,
 						"zoom_level": 1.0
 					},
-					"stack_index": 17,
+					"stack_index": 14,
 					"type": "text"
 				}
 			]
@@ -2322,7 +2221,7 @@
 	],
 	"incremental_find":
 	{
-		"height": 41.0
+		"height": 35.0
 	},
 	"input":
 	{
@@ -2360,7 +2259,7 @@
 		"rows":
 		[
 			0.0,
-			0.829657072007,
+			0.814967220894,
 			1.0
 		]
 	},
@@ -2381,7 +2280,7 @@
 	"project": "system.sublime-project",
 	"replace":
 	{
-		"height": 78.0
+		"height": 66.0
 	},
 	"save_all_on_build": true,
 	"select_file":
@@ -2401,7 +2300,7 @@
 		[
 			[
 				"",
-				"~/dev/www/stock-ticker/Stock-Ticker.sublime-workspace"
+				"~/dev/laravel/budget-app/Laravel_Budget-app.sublime-project"
 			]
 		],
 		"width": 380.0
@@ -2423,7 +2322,7 @@
 	"show_open_files": false,
 	"show_tabs": true,
 	"side_bar_visible": true,
-	"side_bar_width": 205.0,
+	"side_bar_width": 154.0,
 	"status_bar_visible": true,
 	"template_settings":
 	{